B
    λd                @   s  d Z ddlZddlZddlZddlmZmZmZmZmZm	Z	m
Z
mZ ddlmZ ddlZddlmZ ddlmZ ddlmZ ddlmZ dd	lmZ ddlmZ dd
lmZ ddlmZ ddl m!Z! ddl m"Z" ddl m#Z# ddl m$Z$ ddl m%Z% ddl m&Z' ddl m(Z( ddl m)Z) ddl m*Z* ddl m+Z+ ddl m,Z, ddl m-Z- ddl.m/Z/ ddl.m0Z0 ddl.m1Z1 ddl.m2Z2 ddl.m3Z3 ddl4m5Z6 ddl4m7Z7 ddl4m8Z8 dd l4m9Z9 dd!l:m;Z; dd"l<m=Z> dd#l?m@Z@ dd$l?mAZA dd%l?mBZB dd&l?mCZC dd'l?mDZD dd(lEmFZF e,Gd) eHd*gZIeHd+d,d-d.d/d0d1d2d3g	ZJeHd*d4gZKd5ZLd6ZMd7ZNd8ZOd9ZPd:ZQe9j=Z=ee	 e	d;d<d=ZReFd>gd?deejS ee	 eTeeT e>jUdAdBdCZVdejSee	 e,jWdDdEdFZXeFdGgd?dee	 e,jWd;dHdIZYe
eef dJdKdLZZeeTdMdNdOZ[ee6j\ dJdPdQZ]e!j^e,jWeBj_eBj_eTdRdSdTZ`G dUdV dVe0jaZbG dWdX dXe0jcZdeFdYgd?edZef ed[d\d]ZeeFd^gd?G d_d` d`ejfZgeFdaG dbdc dcehdcdddegZieFdfgd?eCjjdedZef eeee>jU   ee7jk ee6j\ ee	 ee eeg eei ee dg	dhdiZldjdk Zmee>jU eee-jn  ege
eee  eejo f dldmdnZpddodpZqdedZef eeee>jU   ee7jk ee6j\ ee	 eTee eeg eei eee>jU  dq
drdsZreeTe
eee>jU  ee,jW ee f dtdudvZseeTe
eee>jU  ee,jW ee f dtdwdxZtdedZef eeeee>jU    eueeeu  eeTeeT f eeeu  ee7jk ee6j\ ee	 eei e
e,jWee>jU f dzd{d|ZveFd}gd?eCjjdedZef eee>jU  eueeeu  eeTeeT f eeeu  ee7jk ee6j\ ee	 eei ee>jU dzd~dZweFdgd?eCjjdedZef eeeee>jU    euee7jk ee6j\ ee	 eei dddZxeFdgd?eCjjdedZef eeeee>jU    ee7jk ee6j\ ee	 eei edddZyeHddddd4ddgZzeTdJddZ{G dd de0jaZ|e,j}dddZ~dedZef eee>jU  ee7jk ee6j\ ee	 ee>jU dddZe,j}dddZdS )z Library of TPU helper functions.    N)AnyCallableIterableListOptionalTextTupleUnion)logging)xla)attr_value_pb2)dynamic_padding_pb2)tpu_embedding_configuration_pb2)tf2)device_util)distribution_strategy_context)auto_control_deps)
c_api_util)composite_tensor)config)constant_op)device)dtypes)errors)
func_graph)function)ops)tensor_shape)	array_ops)control_flow_ops)math_ops)variable_scope)	variables)device_assignment)tpu_feed)tpu_function)tpu_name_util)tpu_ops)core)compat)nest)object_identity)traceback_utils)variable_utils)	tf_exportTPUReplicatedInputZPlaceholderZAudioSummaryZAudioSummaryV2ZHistogramSummaryZImageSummaryZMergeSummaryZPrintZScalarSummaryZTensorSummaryZTensorSummaryV2ZVarHandleOp   Z_tpu_replicateZ_post_device_rewriteZ_tpu_compilation_status_xla_outside_compilationZ_pivot_for_cluster)jobreturnc             C   s   | dkrdS d|  S dS )z;Returns the device name for the TPU_SYSTEM device of `job`.Nz/device:TPU_SYSTEM:0z/job:%s/device:TPU_SYSTEM:0 )r2   r4   r4   K/var/www/html/venv/lib/python3.7/site-packages/tensorflow/python/tpu/tpu.py_tpu_system_device_nameg   s    r6   ztpu.initialize_system)Zv1T)embedding_configr2    compilation_failure_closes_chipstpu_cancellation_closes_chipsr3   c          
   C   s   | dkrdn|   }d}|dk	r.|r*d}nd}tt|f tj||d}| dkrX|S t|g tj|d}W dQ R X t|g tj	|dd	S Q R X W dQ R X dS )
a  Initializes a distributed TPU system for use with TensorFlow.

  Args:
    embedding_config: If not None, a `TPUEmbeddingConfiguration` proto
      describing the desired configuration of the hardware embedding lookup
      tables. If embedding_config is None, no hardware embeddings can be used.
    job: The job (the XXX in TensorFlow device specification /job:XXX) that
      contains the TPU devices that will be initialized. If job=None it is
      assumed there is only one job in the TensorFlow flock, and an error will
      be returned if this assumption does not hold.
    compilation_failure_closes_chips: Set the configuration whether
      we want to close TPU chips when there is a compilation failure.
    tpu_cancellation_closes_chips: Set the configuration whether
      we want to close TPU chips when a TPU execution is cancelled. If the value
      is None, the behavior will be determined by the command line flag
      `tpu_cancellation_closes_chips` for the TPU worker. WARNING: this argument
      only applies to TFRT TPU runtime.
  Returns:
    A serialized `TopologyProto` that describes the TPU system. Note:
      the topology must be evaluated using `Session.run` before it can be used.
  N r         )r8   r9   )r   Ztpu_init_identity)name)
SerializeToStringr   r   r6   r'   Zconfigure_distributed_tpucontrol_dependenciesconfigure_tpu_embeddingr   identity)r7   r2   r8   r9   config_stringZ"tpu_cancellation_closes_chips_enumtopologyZembedding_initr4   r4   r5   initialize_systemo   s"    rD   )r7   r2   r3   c          	   C   s.   |   }tt| tj|dS Q R X dS )aV  Initializes a distributed TPU Embedding system for use with TensorFlow.

  The following two are equivalent:
  1. initialize_system() with embedding_config.
  2. initialize_system() without embedding_config, then
     initialize_system_for_tpu_embedding().
  initialize_system() should not be called with embedding_config if
  initialize_system_for_tpu_embedding() is meant to be called later.

  Args:
    embedding_config: a `TPUEmbeddingConfiguration` proto describing the desired
      configuration of the hardware embedding lookup tables.
    job: The job (the XXX in TensorFlow device specification /job:XXX) that
      contains the TPU devices that will be initialized. If job=None it is
      assumed there is only one job in the TensorFlow flock, and an error will
      be returned if this assumption does not hold.

  Returns:
    A no-op.
  )r   N)r>   r   r   r6   r'   r@   )r7   r2   rB   r4   r4   r5   #initialize_system_for_tpu_embedding   s    rE   ztpu.shutdown_systemc          	   C   s&   t t|  t }W dQ R X |S )aQ  Shuts down a running a distributed TPU system.

  Args:
    job: The job (the XXX in TensorFlow device specification /job:XXX) that
      contains the TPU devices that will be shutdown. If job=None it is
      assumed there is only one job in the TensorFlow flock, and an error will
      be returned if this assumption does not hold.
  N)r   r   r6   r'   shutdown_distributed_tpu)r2   rF   r4   r4   r5   shutdown_system   s    
rG   )r3   c              C   s\   t  } xF| dk	rN|  }x$|dk	r>t|tr6|| fS |j}qW t| dd} q
W tddS )z9Returns the TPUReplicateContext and its associated graph.Nouter_graphziget_replicated_var_handle() called without TPUReplicateContext. This shouldn't happen. Please file a bug.)r   get_default_graph_get_control_flow_context
isinstanceTPUReplicateContextouter_contextgetattr
ValueError)graphZcontext_r4   r4   r5    _enclosing_tpu_context_and_graph   s    



rQ   )strategyr3   c             C   s&   dd }| j }||p$tt||jS )Nc             S   s   | j dS )NZTPUStrategy)__name__
startswith)kr4   r4   r5   <lambda>       z!is_tpu_strategy.<locals>.<lambda>)	__class__anymap	__bases__)rR   Zis_tpu_stratZclzr4   r4   r5   is_tpu_strategy   s    r\   c              C   s(   t  sd S t  } t| s d S | jjS )N)r   Zhas_strategyZget_strategyr\   extendedZ_device_assignment)rR   r4   r4   r5    _enclosing_tpu_device_assignment   s    r^   )opresource_readsresource_writesr3   c             C   sF   | j dkr*|s|r&|  |  dS dS dd }t||pB||S )zGReplaces TPUReplicatedInput outputs with its inputs in resource_inputs.r/   TFc             S   sd   g }g }x0| D ](}|j jdkr|| ||j j qW x|D ]}| | q@W | | |pb|S )zEReplaces handles in `resource_inputs` with their unreplicated inputs.r/   )r_   typeappendextendinputsdiscardupdate)Zresource_inputsZ	to_removeZto_addresourcetr4   r4   r5   #replace_with_unreplicated_resources	  s    



zJtpu_replicated_input_resolver.<locals>.replace_with_unreplicated_resources)rb   clearbool)r_   r`   ra   rj   r4   r4   r5   tpu_replicated_input_resolver   s    

rm   c                   sZ  e Zd ZdZeeejd fddZd.eee	e
ej e
ej f eeejdddZd	d
ddZejedddZejedddZd/ee dddZdd Zd	d
 fddZe
e d
ddZejee
ej e
ej f dddZejd	dddZejejd d!d"Zejd#d$d%Zed&d' Zed(d) Z ejd
d*d+Z!d,d- Z"  Z#S )0rL   ah  A `ControlFlowContext` for nodes inside a TPU computation.

  The primary role of `TPUReplicateContext` is to mark operators inside a
  tpu.replicate() computation with the attribute "_tpu_replicate=XYZ", where XYZ
  is a unique name.

  We use a `ControlFlowContext` to perform the annotation since it integrates
  with Tensorflow constructs like ResourceVariables. For example, if a
  `ResourceVariable` is constructed inside a tpu.replicate() block, the
  `ResourceVariable` implementation can use
  `with ops.control_dependencies(None)` to build the variable's definition
  outside the replicated computation.
  )r=   num_replicaspivotc                s   t t|   || _d| _d| _d| _d| _d| _d| _	g | _
g | _|| _t|| _ttj| jd | _g | _|| _i | _dS )a  Builds a new TPUReplicateContext.

    Args:
      name: a unique name for the context, used to populate the `_tpu_replicate`
        attribute.
      num_replicas: an integer that gives the number of replicas for the
        computation.
      pivot: a pivot node. Nodes in the TPUReplicateContext that do not have any
        inputs will have a control dependency on the pivot node. This ensures
        that nodes are correctly included in any enclosing control flow
        contexts.
    Nr   )s)superrL   __init___num_replicas_outer_device_function_stack_oc_dev_fn_stack_outside_compilation_cluster_outside_compilation_v2_context_outside_compilation_counter_in_gradient_colocation_gradient_colocation_stack_host_compute_core_namer)   as_bytesZ_name_as_bytesr   ZScopedTFBufferr   	AttrValuer>   _tpu_relicate_attr_buf_unsupported_ops_pivot_replicated_vars)selfr=   rn   ro   )rX   r4   r5   rr   )  s"    zTPUReplicateContext.__init__F)r=   	handle_idvars_is_mirrored	is_packedr3   c          	   C   s8  t  }| j|}|dk	r|S |dk	r|stj|d jj}dd |D }	g }
xht|j	D ]T}xNt|j
D ]2}t|j|||d}||	krl|
|	|  P qlW td|q\W n|}
t \}}| X t|
d tjrdd |
D }
| }|| j tj|
|d	 ||d
}|| W dQ R X || j|< |S )a  Returns a variable handle for replicated TPU variable 'var'.

    This is a method used by an experimental replicated variable implementation
    and is not intended as a public API.

    Args:
      name: The common name of the variable.
      handle_id: Unique ID of the variable handle, used as the cache key.
      vars_: The replicated TPU variables or handles.
      is_mirrored: Whether the variables are mirrored, which guarantees the
        values in each replica are always the same.
      is_packed: Whether the replicated variables are packed into one variable.

    Returns:
      The handle of the TPU replicated input node.
    Nr   c             S   s   i | ]}|t |jqS r4   )r   canonicalizer   ).0vr4   r4   r5   
<dictcomp>k  s    zATPUReplicateContext.get_replicated_var_handle.<locals>.<dictcomp>)replicalogical_corer2   zSFailed to find a variable on any device in replica {} for current device assignmentc             S   s   g | ]
}|j qS r4   )handle)r   r   r4   r4   r5   
<listcomp>  s    zATPUReplicateContext.get_replicated_var_handle.<locals>.<listcomp>z/handle)r=   Zis_mirrored_variabler   )r^   r   getpydev
DeviceSpecfrom_stringr   r2   rangern   num_cores_per_replicar   r   Z
tpu_devicerc   rO   formatrQ   Z
as_defaultrK   r"   VariablerJ   Z_set_control_flow_contextrM   r'   tpu_replicated_input)r   r=   r   r   r   r   r#   r   Zjob_nameZdevices_to_varsZreplicated_varsZ
replica_idr   r   _rP   Zsaved_contextr4   r4   r5   get_replicated_var_handleH  sB    	


z-TPUReplicateContext.get_replicated_var_handleN)r3   c             C   sb   | j r^ddd | j d t D }tdt| j | t| j tkr^tdt| j t   d S )N
c             s   s   | ]}d |j |jf V  qdS )z	  %s (%s)N)rb   r=   )r   r_   r4   r4   r5   	<genexpr>  s   zDTPUReplicateContext.report_unsupported_operations.<locals>.<genexpr>z$%d unsupported operations found: 
%sz... and %d more)r   join_MAX_WARNING_LINESr
   warninglen)r   Zop_strr4   r4   r5   report_unsupported_operations  s    
z1TPUReplicateContext.report_unsupported_operations)r_   gradient_uidc             C   s   |d k	rt  jd krry|td}W n tk
r>   d S X |d}|d d | }t|| _	| j	
  d S | j| | jsy^|td}| jrtd|dkrtd|| _|d}|d d | }| j|d W n tk
r   Y nX d S )Nascii.r   z>Cannot nest gradient colocation operations outside compilationZ__unsupported__z;No gradient_uid calling gradient within outside_compilation)cluster)r   rI   _control_flow_contextZget_attr_OUTSIDE_COMPILATION_ATTRdecoderO   splitOutsideCompilationV2Contextrw   Enterrz   rc   rv   ry   NotImplementedError_EnterOutsideCompilationScope)r   r_   r   Zoutside_attrpartsr   r4   r4   r5   EnterGradientColocation  s8    


z+TPUReplicateContext.EnterGradientColocationc             C   s   |d k	rt  jd kr(| jd ks$td S | jd k	rF| j  d | _d S | jsdt|j	|d|j
 | j }||kr|| jkrd | _|   nt|j	|d| d|j
 d S )Nz>Badly nested gradient colocation: empty stack when popping Op z+Badly nested gradient colocation, expected z, got )r   rI   r   rw   AssertionErrorExitrz   r   ZInternalErrornode_defr=   popry   _ExitOutsideCompilationScope)r   r_   r   Zlast_opr4   r4   r5   ExitGradientColocation  s(    




z*TPUReplicateContext.ExitGradientColocation)r   c             C   s   G dd dt }| jrtd|r*|| _nt| j| _|  jd7  _t }| }|| tj	
|j}|jdkr|jd k	r| j| jd t|j  |j| _| j|_d S )Nc               @   s@   e Zd ZdZdd Zedd Zedd Zdd	 Zd
d Z	dS )zATPUReplicateContext._EnterOutsideCompilationScope.<locals>.FakeOpzA helper class to determine the current device.

      Supports only the type and device set/get methods needed to run the
      graph's _apply_device_function method.
      c             S   s
   d| _ d S )Nr:   )_device)r   r4   r4   r5   rr     s    zJTPUReplicateContext._EnterOutsideCompilationScope.<locals>.FakeOp.__init__c             S   s   dS )NFakeOpr4   )r   r4   r4   r5   rb     s    zFTPUReplicateContext._EnterOutsideCompilationScope.<locals>.FakeOp.typec             S   s   | j S )N)r   )r   r4   r4   r5   r     s    zHTPUReplicateContext._EnterOutsideCompilationScope.<locals>.FakeOp.devicec             S   s"   t |tjr| | _n|| _d S )N)rK   r   r   Z	to_stringr   )r   r   r4   r4   r5   _set_device  s    zMTPUReplicateContext._EnterOutsideCompilationScope.<locals>.FakeOp._set_devicec             S   s
   || _ d S )N)r   )r   Z
device_strr4   r4   r5   _set_device_from_string  s    zYTPUReplicateContext._EnterOutsideCompilationScope.<locals>.FakeOp._set_device_from_stringN)
rS   
__module____qualname____doc__rr   propertyrb   r   r   r   r4   r4   r4   r5   r     s   r   z(Cannot nest outside_compilation clustersr;   ZTPU_REPLICATED_CORE:)objectrv   r   strrx   r   rI   Z_apply_device_functionsr   r   r   r   Zdevice_typeZdevice_indexr{   rc   _device_function_stackru   rt   )r   r   r   rP   Zfake_opr   r4   r4   r5   r     s"    


z1TPUReplicateContext._EnterOutsideCompilationScopec             C   s(   | j stdd | _ t }| j|_d S )Nz=Attempted to exit outside_compilation scope when not in scope)rv   rO   r   rI   ru   r   )r   rP   r4   r4   r5   r   "  s    z0TPUReplicateContext._ExitOutsideCompilationScopec                s,   | j st }|j | _ tt|   d S )N)rt   r   rI   r   copyrq   rL   r   )r   rP   )rX   r4   r5   r   *  s    zTPUReplicateContext.Enterc             C   s   | j S )N)r{   )r   r4   r4   r5   HostComputeCore4  s    z#TPUReplicateContext.HostComputeCore)r_   r3   c             C   sz   g }g }xV|j D ]L}d}| }x |dk	r@|| kr8d}P |j}q"W |rR|| q|| qW |  || ||fS )z2Remove any external control dependency on this op.FNT)Zcontrol_inputsrJ   _outer_contextrc   Z_remove_all_control_inputs_add_control_inputs)r   r_   internal_control_inputsexternal_control_inputsxZis_internal_opZctxtr4   r4   r5   _RemoveExternalControlEdges7  s     


z/TPUReplicateContext._RemoveExternalControlEdgesc       	   	   C   s  |j tkrtd|j |j |j tkr2| j| tdd |j	D rXt
d|j dt|jjkrd|jjkrtd| d|t| jj | jr|ttjt| jd | jd	ks| js|j| |j| | |\}}|j	s|sJ||   nDxBt t!|j	D ]0}|j	| }| "|}||k	r|#|| qW |rt$%d $ | &  d
d |D }| '  W d Q R X |(| dd |j)D }| }x"|d k	r|j*+| |j,}qW | j,r| j,-| d S )NzlOperation of type %s (%s) is not supported on the TPU. Execution will fail if this op is used in the graph. c             s   s   | ]}|j jV  qd S )N)dtypeZ_is_ref_dtype)r   r   r4   r4   r5   r   [  s    z,TPUReplicateContext.AddOp.<locals>.<genexpr>zQNon-resource Variables are not supported inside TPU computations (operator name: )Z_clonedz)TPU computations cannot be nested on op ()rp   r;   c             S   s$   g | ]}|j rt|j d  jqS )r   )outputsr   rA   r_   )r   r   r4   r4   r5   r     s   z-TPUReplicateContext.AddOp.<locals>.<listcomp>c             S   s   g | ]
}|j qS r4   )r=   )r   r   r4   r4   r5   r     s    ).rb   _DENYLISTED_OPSr
   errorr=   _UNSUPPORTED_OPSr   rc   rY   re   r   _TPU_REPLICATE_ATTRr   attrrO   Z_set_attr_with_bufr   bufferrv   	_set_attrr   r   r~   r)   r}   rs   rP   Zprevent_feedingZprevent_fetchingr   Z_add_control_inputGetControlPivotr   r   AddValueZ_update_inputr   r?   r   r   r   r   _valuesrg   r   
AddInnerOp)	r   r_   r   r   indexr   Zreal_xZoutput_namescontextr4   r4   r5   AddOpQ  sX    






zTPUReplicateContext.AddOp)valr3   c             C   sv   | j s
|S |j| jkr4| j|j}|dkr0|S |S |}| j|j | j rf| j |}| j|j || j|j< |S )zCAdd `val` to the current context and its outer context recursively.N)r   r=   r   Z_external_valuesr   addr   )r   r   resultr4   r4   r5   r     s    zTPUReplicateContext.AddValue)r_   c             C   s    |  | | jr| j| d S )N)r   r   r   )r   r_   r4   r4   r5   r     s    
zTPUReplicateContext.AddInnerOpc             C   s   d S )Nr4   )r   r4   r4   r5   
grad_state  s    zTPUReplicateContext.grad_statec             C   s   |   r|   jS dS )z0Forwards to the enclosing while context, if any.F)ZGetWhileContext	back_prop)r   r4   r4   r5   r     s    
zTPUReplicateContext.back_propc             C   s   | j S )N)r   )r   r4   r4   r5   r     s    z#TPUReplicateContext.GetControlPivotc             C   s   dS )NTr4   )r   r4   r4   r5   RequiresUniqueFunctionRetracing  s    z3TPUReplicateContext.RequiresUniqueFunctionRetracing)FF)N)$rS   r   r   r   r   intr   	Operationrr   r	   r   
core_typesTensorr"   r   rl   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   __classcell__r4   r4   )rX   r5   rL     s(   $ 0H
6/
"NrL   c               @   sL   e Zd ZdZedddZejddddZejddd	d
Z	dddZ
dS )r   zThe context for outside compilation in Tensorflow 2.0.

  Every op added in this context will be assigned an _xla_outside_compilation
  attribute.
  )r=   c             C   s   t j|  || _d S )N)r   ControlFlowContextrr   r|   )r   r=   r4   r4   r5   rr     s    z$OutsideCompilationV2Context.__init__N)r_   r3   c             C   s2   | j r| j | |dtjt| jd d S )Nr1   )rp   )r   r   r   r   r~   r)   r}   r|   )r   r_   r4   r4   r5   r     s    z!OutsideCompilationV2Context.AddOpc             C   s2   | j r| j | |dtjt| jd d S )Nr1   )rp   )r   r   r   r   r~   r)   r}   r|   )r   r_   r4   r4   r5   r     s    z&OutsideCompilationV2Context.AddInnerOpc             C   s   t d S )N)r   )r   Zcontext_defZexport_scoper4   r4   r5   to_control_flow_context_def  s    z7OutsideCompilationV2Context.to_control_flow_context_def)N)rS   r   r   r   r   rr   r   r   r   r   r   r4   r4   r4   r5   r     s
   r   ztpu.outside_compilation.)computationr3   c             O   s"  |dkrg n|}t  }t|tjryt \}}W n$ tk
rV   td | ||S X t	|j
}|j
d |_
t|}|  |dkrg n|}| ||}|  |S | }	|	}
x |
rt|
tr|
  |
j}
qW | ||}| }|	|k	rtd|	}
x$|
rt|
tr|
  |
j}
qW |S )a  Builds part of a computation outside any current TPU replicate scope.

  `tf.tpu.outside_compilation()` is used to run ops in `computation` on CPU
  instead of running on TPU. For example, users can run ops that are not
  supported on TPU's (e.g. tf.summary.write()) by explicitly placing those
  ops on CPU's. Below usage of outside compilation will place ops in
  `computation_with_string_ops` on CPU.

  Example usage:

  ```python
  def computation_with_string_ops(x):
    # strings types are not supported on TPU's and below ops must
    # run on CPU instead.
    output = tf.strings.format('1{}', x)
    return tf.strings.to_number(output)

  def tpu_computation():
    # Expected output is 11.
    output = tf.tpu.outside_compilation(computation_with_string_ops, 1)
  ```

  Outside compilation should be called inside TPUReplicateContext. That is,
  `tf.tpu.outside_compilation()` should be called inside a function that is
  passed to `tpu.split_compile_and_replicate()` -- this is implied when
  outside compilation is invoked inside a function passed to TPUStrategy
  `run()`. If invoked outside of TPUReplicateContext,
  then this simply returns the result of `computation`, and therefore,
  would be a no-op. Note that outside compilation is different from
  `tf.distribute.experimental.TPUStrategy.merge_call()` as logic in
  outside compilation is replicated and executed separately for each
  replica. On the other hand, `merge_call()` requires a `merge_fn`
  to aggregate the inputs from different replicas and is executed only
  once.

  For variables placed in TPU device, which includes variables created inside
  TPUStrategy scope, outside compilation logic must not include variable
  read/write. For variables placed on host, which is the case when variables
  created via TPUEstimator, variable read/write is only allowed if the variable
  is not accessed by any other ops in the TPU computation. Variable read/write
  from outside compilation cluster is not visible from TPU computation and
  vice versa. Therefore, if outside compilation logic contains such host
  variables read/write ops and if the variables are accessed by TPU
  computation as well, then this may lead to deadlock.

  Internally, `tf.tpu.outside_compilation()` adds outside compilation
  attributes to all ops in `computation`. During later graph pass, these
  ops with outside compilation attribute is extracted out and replicated
  into a host-side graph. Inputs to this extract host-side graph is sent
  from TPU computation graph to host graph via a pair of XlaSendToHost and
  XlaRecvFromHost ops. Note that using `tf.tpu.outside_compilation()`
  may result in tensor transfer between TPU and CPU, leading to non-trivial
  performance impact.

  Args:
    computation: A Python function that builds the computation to
      place on the host.
    *args: the positional arguments for the computation.
    **kwargs: the keyword arguments for the computation.

  Returns:
    The Tensors returned by computation.
  NzOutside compilation attempted outside TPUReplicateContext scope. As no enclosing TPUReplicateContext can be found, returning the result of `computation` as is.r;   zYControl-flow context cannot be different at start and end of an outside_compilation scope)r   rI   rK   r   	FuncGraphrQ   rO   r
   r   r   rx   r   r   r   rJ   rL   r   rM   r   r   )r   argskwargsrP   Ztpu_contextr   Zoutside_compilation_nameZoutside_compilation_contextretvalZinitial_contextr   Zfinal_contextr4   r4   r5   outside_compilation  sF    C





r   ztpu.PaddingSpecc               @   s   e Zd ZdZdZdZdS )PaddingSpecz:Represents the type of padding policies for tpu.replicate.r   r;   N)rS   r   r   r   ZAUTOPOWER_OF_TWOr4   r4   r4   r5   r   l  s   r   ztpu.XLAOptionsc                   s"   e Zd ZdZd fdd	Z  ZS )
XLAOptionsa{  XLA compilation options.

  Attributes:
    use_spmd_for_xla_partitioning: Boolean. Whether to use XLA's SPMD
      partitioner instead of MPMD partitioner when compiler partitioning is
      requested.
    enable_xla_dynamic_padder: Boolean. Whether to enable XLA dynamic padder
      infrastructure to handle dynamic shapes inputs inside XLA. True by
      default. Disabling this may cause correctness issues with dynamic shapes
      inputs, as XLA will just assume the inputs are with padded shapes. However
      users can optionally set it to False to improve device time if masking is
      already handled in the user side.
  Tc                s   t t| | ||S )N)rq   r   __new__)clsuse_spmd_for_xla_partitioningenable_xla_dynamic_padder)rX   r4   r5   r     s    zXLAOptions.__new__)TT)rS   r   r   r   r   r   r4   r4   )rX   r5   r   v  s    r   r   r   ztpu.replicate)	r   re   infeed_queuer#   r=   maximum_shapespadding_specxla_optionsr3   c          
   C   s   t | |||||||dd S )a  Builds a graph operator that runs a replicated TPU computation.

  Example for the basic usage that `inputs` has static shape:

  ```python

  def computation(x):
    x = x + 1
    return tf.math.reduce_mean(x)

  x = tf.convert_to_tensor([1., 2., 3.])
  y = tf.convert_to_tensor([4., 5., 6.])
  tf.compat.v1.tpu.replicate(computation, inputs=[[x], [y]])
  ```

  If the `inputs` has dynamic shapes and you would like to automatically
  bucketize the inputs to avoid XLA recompilation. See the advanced example
  below:

  ```python

  def computation(x):
    x = x + 1
    return tf.math.reduce_mean(x)

  # Assume input tensors in two replicas `x` and `y` both have dynamic shape
  # ([None, 2]).
  tf.compat.v1.tpu.replicate(
    computation,
    inputs=[x, y],
    maximum_shapes=[tf.TensorShape([None, None])],
    padding_spec=tf.compat.v1.tpu.PaddingSpec.POWER_OF_TWO)
  ```

  Args:
    computation: A Python function that builds the computation to replicate.
    inputs: A list of lists of input tensors or `None` (equivalent to
      `[[]]`), indexed by `[replica_num][input_num]`. All replicas must
      have the same number of inputs. Each input can be a nested structure
      containing values that are convertible to tensors. Note that passing an
      N-dimension list of compatible values will result in a N-dimension list of
      scalar tensors rather than a single Rank-N tensors. If you need different
      behavior, convert part of inputs to tensors with `tf.convert_to_tensor`.
    infeed_queue: If not `None`, the `InfeedQueue` from which to append a tuple
      of arguments as inputs to computation.
    device_assignment: If not `None`, a `DeviceAssignment` describing the
      mapping between logical cores in the computation with physical cores in
      the TPU topology. Uses a default device assignment if `None`. The
      `DeviceAssignment` may be omitted if each replica of the computation uses
      only one core, and there is either only one replica, or the number of
      replicas is equal to the number of cores in the TPU system.
    name: (Deprecated) Does nothing.
    maximum_shapes: A nested structure of tf.TensorShape representing the shape
      to which the respective component of each input element in each replica
      should be padded. Any unknown dimensions (e.g.
      tf.compat.v1.Dimension(None) in a tf.TensorShape or -1 in a tensor-like
      object) will be padded to the maximum size of that dimension over all
      replicas. The structure of `maximum_shapes` needs to be the same as
      `inputs[0]`.
    padding_spec: An enum specified by `tpu.PaddingSpec`. This describes the
      padding policy when the `inputs` to `tpu.replicate` is dynamic.
      One usage is to enable automatic bucketizing on the inputs by setting the
      value to `tpu.PaddingSpec.POWER_OF_TWO`, which can help to reduce the
      recompilation in the XLA side.
    xla_options: An instance of `tpu.XLAOptions` which indicates the options
      passed to XLA compiler. Use `None` for default options.
  Returns:
    A list of outputs, indexed by `[replica_num]` each output can be a nested
    structure same as what computation() returns with a few exceptions.

    Exceptions include:
      1) None output: a NoOp would be returned which control-depends on
         computation.
      2) Single value output: A tuple containing the value would be returned.
      3) Operation-only outputs: a NoOp would be returned which
         control-depends on computation.
      TODO(b/121383831): Investigate into removing these special cases.

  Raises:
    ValueError: If all replicas do not have equal numbers of input tensors.
    ValueError: If the number of inputs per replica does not match
      the number of formal parameters to `computation`.
    ValueError: If the static `inputs` dimensions don't match with the values
      given in `maximum_shapes`.
    ValueError: If the structure of inputs per replica does not match
      the structure of `maximum_shapes`.
  )r   r   r   r;   )split_compile_and_replicate)r   re   r   r#   r=   r   r   r   r4   r4   r5   	replicate  s    br   c             C   sR   t | tj} t | t |d  }t |}t |d |}t |tj}|S )zCeil input `x` to power of `n`.g      ?)r    castr   Zfloat32logceilpowint32)r   nZlognxr   r4   r4   r5   _ceil_to_pow_of_n  s    
r  )re   padded_shapesr   r3   c          	      sv  g }g }g }xt | D ]\}}xt |D ]\}    }	|dkrp|g  ||	 |tj|	dtd nLx8t |	D ],\}
}|dks||| |
 krzd|| |
< qzW t|	|| ||< t	 }|j
ttjdd || | q(W qW g }x&|D ]}|tjt|dd qW g }g }g }xt | D ]\}}|g  |g  t|d }xt |D ]\} || | }   }	|| }t|| r&|dk	r&xvt |	D ]j\}
}|| |
 r|dkr|d7 }t }||_|
|_||_|| || t||
 tj qW g xt |jD ]\}
}|| |
 rd	}|jdk	rjt|j|}n*t|| |
 |}|tj krt!|d	}d|||
  g}nddg}| q6W   " rt#$t%d fd
d fdd}nt& }|j
ttjdd || | n||   qlW q6W t|}x$t'|D ]}
||
 (||
  qRW ||fS )a  Pad all input tensors given padded_shapes.

  The real shape tensors will be concatenated with the padded original inputs.

  Args:
    inputs: The original inputs.
    padded_shapes: A list of padded shapes for each input. If an entry is None,
      no padding is performed.
    padding_spec: An enum specified by `tpu.PaddingSpec`. This describes the
      padding policy when the `inputs` to `tf.tpu.replicate` is dynamic.
      One usage is to enable automatic bucketizing on the inputs by setting the
      value to `tpu.PaddingSpec.POWER_OF_TWO`, which can help to reduce the
      recompilation in the XLA side.

  Returns:
    The padded inputs and a PaddingMap list which maps the padded input
    dimension to the real shape argument index.
  r   F)r   NT)b)axisr;   r<   c                  s   t  S )N)r   padr4   )input_tensorpaddingsr4   r5   rV   v  rW   z _pad_all_input.<locals>.<lambda>c                  s    S )Nr4   r4   )r  r4   r5   rV   w  rW   ))	enumerate	get_shapeas_listrc   npZ	full_likerl   maxr   shaper_   r   _POST_DEVICE_REWRITE_ATTRr   r~   r    Z
reduce_maxstackr   rY   dynamic_padding
PaddingMap	arg_indexshape_indexpadding_arg_indexr   r   r   Zdimsvaluemaximumr   r   r  Zis_fully_definedr   Zcondconstantr  r   rd   )re   r  r   Zmaximum_static_shapesZneed_paddingZinput_shape_tensorsZcore_idxZinputs_per_coreidxinput_shapeirp   Zreal_input_shaper   Zshapes_per_inputZpadded_inputsZreal_shapespadding_mapsZreal_shape_idxZinput_shape_tensorZpadded_shapepadding_mapZminimum_dynamic_dim_sizeZmax_dim_sizepaddingZpadded_inputrn   r4   )r  r  r5   _pad_all_input  s    








r  c             C   s,   t | tjr(ttj| dd}|f| S |S )ai  For an input, replaced the input by a tuple if the input is composite.

  If `maybe_composite` is not composite, return the parameter
  `non_composite_output` otherwise return a tuple which consists of the value of
  the parameter `composite_output` the same number of times as there are
  components of the composite tensor.

  This is useful for computing a mask when flattening nested data with
  `expand_composites=True`. For example

  ```python
  nest.flatten(data, expand_composites=True)
  ```

  and

  ```python
  nest.flatten(nest.map(
      data, lambda x: _flatten_and_filter_composite(x, False, True)))
  ```

  will have the same length and second will be True if the tensor in the first
  is derived from a expanding a composite tensor.

  Args:
    maybe_composite: A value to test for being a composite tensor.
    non_composite_output: The value to return when `maybe_composite` is not a
      composite.
    composite_output: the value to fill the output tuple with if
      `maybe_composite` is a composite.

  Returns:
    `non_composite_output` or a tuple with multiple copies of
    `composite_output`.
  T)expand_composites)rK   r   ZCompositeTensorr   r*   flatten)Zmaybe_compositeZnon_composite_outputZcomposite_outputZnum_componentsr4   r4   r5   _flatten_and_filter_composite  s    &
r!  )
r   re   r   r#   r=   use_tpur   r   r   r3   c	       3   
      s  ~|dkrg gn|}|pt  }i }	|dk	rN|j |j  d}	|j|	d< t |	d< t rlt	
d t|tstdt| tdd |D rtd	d
d |D  t|}
|
dkrg S x&td|
D ]t|d |  qW t|}dd |D }ttdd |d }g x"|D ]}dd |D  q*W dd d D }t|d }t|}xlt|
D ]`t| |krtd|t| dd  D }||krxtd||qxW t| ||}|dk	r\|dkr(td| ddd |d D  d| n4td| ddd |d D  dd|j d| d}|r|rttdtj|d |dd  td!d tt|d t|D }d"d |D }tjd |dd  }t|||\}|rd#}t	
d$|d  t| d%d&|	d%< |j |	d'< t!" }g }xNtdtd D ]8fd(dt|
D }|t#j$|d)d* q<W t|t%j&r|'d+|j( }n
|'d,}t)j*|d- d*}|+t,t-j.t/0|d. t1||
|d/}zn|2  t#j3f |
|d0|	}t45|
 t!6|gp |rv|j7rvxL|D ]D}||j8 j9}t:;||j8 |j<||j= ||j8< ||j8 >| q.W d1d t?|D }x:t||D ],\} |r| rj@+d2t-j.d#d3 qW d4d t||d D }!tjA|d |!d| d#d5}!|dk	r*|B|
 x|C D ]}"|!|" qW tDE }#|#jF}$|#jGfd6d7}%|#Hd# |#I|% | |! }&|#H|$ |#I t|&}&W dQ R X W dQ R X |j o|dk	o|jdk}'tJ|&}(|(rtK|&|'\}) ntL|&|'\}) tMjNrtO}*ndd8lPmQ}* |*jRS r>tTU r"t	Vd9 n|*R }+|+Wt!" |) |
})|X|) W d|Y  |Z  |[ },X |,rt-. }-|-jj\]d:d |,D  |+d;|- t!6|gF |rt#^ }.|.j@}/t-j.t/0|d.}-|/+t_|- nt)j*d<d*}.W dQ R X |)s|. fd=dt|
D gS d>d t|
D }0xt?|)D ]\}"|"dkrdx t|
D ]}1|0|1 d qHW q,t#j`|"|
d?d*}2t!6 > x6t|
D ]*}1|0|1 tajb|2|1 d@|1f d* qW W dQ R X q,W fdAd|0D }0|.|0gS )Bay  Builds graph operators that runs compilation and replicated computation.

  This is a lower level interface than replicate that returns a separate compile
  and execute output tensor. In the generated graph the compile op feeds into
  the execute op and no additional compilation is incurred when running the
  compile op before the execute op. The compile op returns additional
  information about the compilation but does not return the compiled program.

  Args:
    computation: A Python function that builds the computation to replicate.
    inputs: A list of lists of input tensors or `None` (equivalent to
      `[[]]`), indexed by `[replica_num][input_num]`. All replicas must
      have the same number of inputs. Each input can be a nested structure
      containing values that are convertible to tensors. Note that passing an
      N-dimension list of compatible values will result in a N-dimension list of
      scalar tensors rather than a single Rank-N tensors. If you need different
      behavior, convert part of inputs to tensors with `tf.convert_to_tensor`.
    infeed_queue: If not `None`, the `InfeedQueue` from which to append a tuple
      of arguments as inputs to computation.
    device_assignment: If not `None`, a `DeviceAssignment` describing the
      mapping between logical cores in the computation with physical cores in
      the TPU topology. Uses a default device assignment if `None`. The
      `DeviceAssignment` may be omitted if each replica of the computation uses
      only one core, and there is either only one replica, or the number of
      replicas is equal to the number of cores in the TPU system.
    name: (Deprecated) Does nothing.
    use_tpu: When false, the input `computation` is executed on the XLA CPU/GPU
      backends. Currently, only supports a default placement (computation is
      placed on GPU if one is available, and on CPU if not).
    maximum_shapes: A nested structure of tf.TensorShape representing the shape
      to which the respective component of each input element in each replica
      should be padded. Any unknown dimensions (e.g.
      tf.compat.v1.Dimension(None) in a tf.TensorShape or -1 in a tensor-like
      object) will be padded to the maximum size of that dimension over all
      replicas. The structure of `maximum_shapes` needs to be the same as
      `inputs[0]`.
    padding_spec: An enum specified by `tf.tpu.PaddingSpec`. This describes the
      padding policy when the `inputs` to `tf.tpu.replicate` is dynamic.
      One usage is to enable automatic bucketizing on the inputs by setting the
      value to `tpu.PaddingSpec.POWER_OF_TWO`, which can help to reduce the
      recompilation in the XLA side.
    xla_options: An instance of `tpu.XLAOptions` which indicates the options
      passed to XLA compiler. Use `None` for default options.

  Returns:
    A list of lists with the first list corresponding to the compile op and the
    second a list of output tensors, indexed by `[replica_num][output_num]`.
  Raises:
    ValueError: If all replicas do not have equal numbers of input tensors.
    ValueError: If the number of inputs per replica does not match
      the number of formal parameters to `computation`.
    ValueError: If the static `inputs` dimensions don't match with the values
      given in `maximum_shapes`.
    ValueError: If the structure of inputs per replica does not match
      the structure of `maximum_shapes`.
  N)rC   r#   r   Zallow_soft_placementzfAutomatic outside compilation is enabled. Ops without XLA kernels will be automatically placed on CPU.z@tpu.replicate() inputs must be a list of lists/tuples, received c             s   s   | ]}t |ttf V  qd S )N)rK   listtuple)r   inpr4   r4   r5   r     s    z.split_compile_and_replicate.<locals>.<genexpr>zGtpu.replicate() inputs must be a list of lists/tuples, received types: c             S   s   g | ]}t |qS r4   )rb   )r   r%  r4   r4   r5   r     s    z/split_compile_and_replicate.<locals>.<listcomp>r   r;   c             S   s   g | ]}t j|d dqS )T)r  )r*   r   )r   Zper_replica_inputr4   r4   r5   r   )  s   c             S   s   t | ddS )NFT)r!  )r   r4   r4   r5   rV   /  rW   z-split_compile_and_replicate.<locals>.<lambda>c             S   s(   g | ] }|d krt dnt|qS )Nr   )r   r  r   convert_to_tensor)r   r   r4   r4   r5   r   6  s   c             S   s   g | ]
}|j qS r4   )r   )r   r   r4   r4   r5   r   ;  s    z`Replicas must have the same number of inputs. Replica 0 had {} inputs, replica {} had {} inputs.c             S   s   g | ]
}|j qS r4   )r   )r   r   r4   r4   r5   r   D  s    zdReplicas must have matching input types. Replica 0 had input types {}, replica {} had input types {}zOSupplied computation cannot be called with the specified inputs. You specified z	 inputs: c             S   s   g | ]
}|j qS r4   )r=   )r   r  r4   r4   r5   r   O  s    z, but the computation needs c             S   s   g | ]
}|j qS r4   )r=   )r   r  r4   r4   r5   r   T  s     zand z: additional inputs from infeed, but the computation needs Fz9Dynamic input shapes are not supported with infeed queues)Zcheck_typesc             S   s   g | ]\}}t ||qS r4   )r!  )r   r   yr4   r4   r5   r   l  s   c             S   s"   g | ]}|d k	rt |nd qS )N)r   TensorShape)r   rp   r4   r4   r5   r   p  s   Tz&TPU has inputs with dynamic shapes: %sZstep_marker_locationZSTEP_MARK_AT_ENTRYr   c                s   g | ]} |  qS r4   r4   )r   r   )flat_inputsr  r4   r5   r     s    zinput{})r=   Zcluster_r   z/pivot)rp   )r=   rn   ro   )rn   r"  c             S   s$   g | ]\}}t j|d |dqS )zreplicated_input_{})r=   )r   rA   r   )r   r  r   r4   r4   r5   r     s   Z_tpu_input_identity)r  c             S   s    g | ]\}}|d krd n|qS )Nr4   )r   Z
replicatedr%  r4   r4   r5   r     s    )Z	structureZflat_sequencer  c                sX   | dd}|dk	r*d|d< td||  dkrB| |f||S  | |f||S dS )z)Variables on TPU have a few restrictions.partitionerNz~Partitioned variables are not supported on TPU. Got `partitioner` that is %s for variable %s. Setting `partitioner` to `None`.)r   r
   r   )getterr=   r   r   r+  )saved_custom_getterr4   r5   custom_getter  s    z2split_compile_and_replicate.<locals>.custom_getter)tensor_tracerz<TF API ver >= 2.0 detected. Tensor Tracer v1 is not enabled.c             s   s   | ]}t |V  qd S )N)r)   r}   )r   r   r4   r4   r5   r     s    host_compute_coreZcompilation_statusc                s   g | ]}t j d | dqS )zshard_%d)r=   )r   group)r   r  )control_depsr4   r5   r   .  s   c             S   s   g | ]}g qS r4   r4   )r   r  r4   r4   r5   r   4  s    zoutput{}zoutput_%d_shard_%dc                s   g | ]}t j |d dqS )T)r  )r*   pack_sequence_as)r   Zreplica_outs)pack_templater4   r5   r   M  s   )cr   rC   Z
serializedZcore_assignmentr   tolistr   r   Zget_soft_device_placementr
   inforK   r#  	TypeErrorrb   rY   r   r   r*   Zassert_same_structurer-   Zconvert_variables_to_tensorsZmap_structurerc   rO   r   r   Zcheck_function_argument_countZnumber_of_tuple_elementszipr  rN   r   r   rI   r'   r   r   r   unique_namer=   r   no_opr   _PIVOT_FOR_CLUSTERr   r~   r)   r}   rL   r   Ztpu_replicate_metadatar%   Ztpu_shard_contextr?   r   r  r  tf2xlaZset_dynamic_dimension_sizer  r  	set_shaper  r_   r3  Zset_number_of_shardsZgenerate_dequeue_opr!   get_variable_scopeZuse_resourcer.  Zset_use_resourceset_custom_getterZis_flat_postprocess_flat_outputs_postprocess_non_flat_outputstypingTYPE_CHECKINGr   tensorflow.python.tpur/  ZTensorTracer
is_enabledr   enabledwarnZ	trace_tpuZ
ExitResultr   r   r   rp   rd   Ztpu_compilation_result_TPU_COMPILATION_STATUS_ATTRZtpu_replicated_outputr   rA   )3r   re   r   r#   r=   r"  r   r   r   Zmetadata_kwargsrn   Zflat_inputs_with_nonesZis_compositer%  Zflat_input_typesZinput_arityZflat_input_aritytypesZ	arg_errorZdynamic_shape_inputsZflat_maximum_shapesZunpadded_inputsr  rP   Zflat_replicated_inputsZreplicasZcluster_namero   r   metadatar  r  Z	compositeZcomputation_inputsri   vscopeZsaved_use_resourcer.  r   need_spmd_partitioningZoutputs_is_flatoutput_tensorsr/  ttr0  Z
attr_valueZcompile_statusr_   Zreplicated_outputsr   Zysr4   )r2  r*  r  r4  r-  r5   r     sn   C









(


















.
r   )r   rL  r3   c       	   
      s  | dkrt  } tj| dd}tj| dd} | t f7 } dd  yH|rZ fdd| D } n,ttd	  fd
d| D } W dQ R X W n0 tk
r } zt	d| W dd}~X Y nX dd | D }dd | D }| || krt	dt
|dkr|ddt
|  }g }x|D ]}|dkr0|d n|rbt|}|jdtjdd || nRt|jrt|jntd	0 t|}|jdtjdd || W dQ R X qW |||fS )a  Validates non-flat outputs, add backs device assignments and other attrs.

  Args:
    outputs: Output from `computation` inside `tpu.rewrite`.
    need_spmd_partitioning: Whether XLA SPMD partitioning is needed.

  Returns:
    - Tensors extracted from outputs.
    - Operations extracted from outputs.
    - A pack template for use with nest.pack_sequence_as to pack the tensors.
  NF)r  Tc             S   s   | d krd S t | S )N)r   r&  )r   r4   r4   r5   rV   |  rW   z+_postprocess_flat_outputs.<locals>.<lambda>c                s$   g | ]}t |tjr|n |qS r4   )rK   r   r   )r   o)maybe_convertr4   r5   r     s   z-_postprocess_flat_outputs.<locals>.<listcomp>r   c                s$   g | ]}t |tjr|n |qS r4   )rK   r   r   )r   rO  )rP  r4   r5   r     s   z_TPU function return values must all either be Operations or convertible to Tensors. Got error: c             S   s   g | ]}t |tjr|qS r4   )rK   r   r   )r   rO  r4   r4   r5   r     s    c             S   s   g | ]}t |tjs|qS r4   )rK   r   r   )r   rO  r4   r4   r5   r     s    zYTPU functions must return zero-or more Tensor values followed by zero or more Operations.r;   _tpu_output_identity)r  )r$  r*   r   r   r:  r   r   r(   	ExceptionrO   r   rc   r   rA   r_   r   r   r~   )	r   rL  r4  eZoutput_operationsrM  Znew_output_tensorsri   rO  r4   )rP  r5   r@  T  sH    






r@  c             C   s,  t j| dd}xt|D ]\}}|dkr6d||< qt|tjrTtd|j dyt|}W n2 t	k
r } ztd| dW dd}~X Y nX |rt
|}|jdtjdd t
|||< qt|jr|jntd	4 t
|}|jdtjdd t
|||< W dQ R X qW |g | fS )
a  Validates non-flat outputs, add backs device assignments and other attrs.

  Args:
    outputs: Output from `computation` inside `tpu.rewrite`.
    need_spmd_partitioning: Whether XLA SPMD partitioning is needed.

  Returns:
    - Tensors extracted from outputs.
    - An empty Operations list because Operations are not allowed in non-flat
      outputs.
    - A pack template for use with nest.pack_sequence_as to pack the tensors.
  T)r  Nztpu.rewrite does not support Operation as return value in non-flat output structure. You can set returned Operations as control dependencies of returned Tensors so Operations are triggered when Tensors are evaluated. Operation found: ""z`TPU function return values must all either be Operations or convertible to Tensors. Got error: "rQ  )r  r   )r*   r   r  rK   r   r   rO   r=   r&  rR  r   rA   r_   r   r   r~   r   r(   )r   rL  Zflat_outputsr  rO  rS  r4   r4   r5   rA    s,     

rA  r;   )r   re   
num_shardsinput_shard_axesoutputs_from_all_shardsoutput_shard_axesr   r#   r=   r   r3   c
                s   dkrt d  |dkr"g n|}t|tsBtdt| dd |D }|dkrfdgt| }t|t|krt dt| dt| d	|rć fd
dt||D }
dd t|
 D }n
g g  }t| |||||	d\}}t|d tj	r||d gfS t|d }|dkr&dg| }|t|krNt d| dt| dt|t
rd|g| }|t|krt d| dt| dg }xt||t| D ]l\}}}|r|d j}|dk	o|jdk}||rtt|ntjt||d n||d  qW ||fS )a  Shards `computation` for parallel execution.

  `inputs` must be a list of Tensors or None (equivalent to an empty list), each
  of which has a corresponding split axis (from `input_shard_axes`). Each input
  is split into `num_shards` pieces along the corresponding axis, and
  computation is applied to each shard in parallel.

  Tensors are broadcast to all shards if they are lexically captured by
  `computation`. e.g.,

  x = tf.constant(7)
  def computation():
    return x + 3
  ... = shard(computation, ...)

  If `outputs_from_all_shards` is true, the outputs from all shards of
  `computation` are concatenated back together along their `output_shard_axes`.
  Otherwise, each output is taken from an arbitrary shard.

  Inputs and outputs of the computation must be at least rank-1 Tensors.

  Args:
    computation: A Python function that builds a computation to apply to each
      shard of the input.
    inputs: A list of input tensors or None (equivalent to an empty list). Each
      input tensor has a corresponding shard axes, given by `input_shard_axes`,
      which must have size divisible by `num_shards`.
    num_shards: The number of shards.
    input_shard_axes: A list of dimensions along which to shard `inputs`, or
      `None`. `None` means "shard all inputs along dimension 0". If not `None`,
      there must be one dimension per input.
    outputs_from_all_shards: Boolean or list of boolean. For each output, if
      `True`, outputs from all shards are concatenated along the corresponding
      `output_shard_axes` entry. Otherwise, each output is taken
      from an arbitrary shard. If the argument is a boolean, the argument's
      value is used for each output.
    output_shard_axes: A list of dimensions along which to concatenate the
      outputs of `computation`, or `None`. `None` means "concatenate all outputs
      along dimension 0". If not `None`, there must be one dimension per output.
      Ignored if `outputs_from_all_shards` is False.
    infeed_queue: If not `None`, the `InfeedQueue` to use to augment the inputs
      of `computation`.
    device_assignment: If not `None`, a `DeviceAssignment` describing the
      mapping between logical cores in the computation with physical cores in
      the TPU topology. Uses a default device assignment if `None`. The
      `DeviceAssignment` may be omitted if each shard of the computation uses
      only one core, and there is either only one shard, or the number of shards
      is equal to the number of cores in the TPU system.
    name: (Deprecated) Does nothing.
    xla_options: An instance of `tpu.XLAOptions` which indicates the options
      passed to XLA compiler. Use `None` for default options.
  Returns:
    A tuple of (compile op, [output tensors]).
  Raises:
    ValueError: If num_shards <= 0
    ValueError: If len(input_shard_axes) != len(inputs)
    ValueError: If len(output_shard_axes) != len(outputs from `computation`)
  r   z0num_shards must be a positive integer. Received NzAtpu.shard()'s inputs must be a list of Tensors or None. Received c             S   s   g | ]}t |qS r4   )r   r&  )r   r   r4   r4   r5   r   G  s    z+split_compile_and_shard.<locals>.<listcomp>zKLength of input_shard_axes must be equal to the number of inputs. Received z inputs and z input_shard_axes.c                s    g | ]\}}t j| |d qS ))r  )r   r   )r   r  r   )rU  r4   r5   r   T  s   c             S   s   g | ]}t |qS r4   )r#  )r   r  r4   r4   r5   r   X  s    )r   r#   r=   r   zMLength of output_shard_axes must be equal to the number of outputs. Received z outputs and z output_shard_axes.zSLength of outputs_from_all_shards must be equal to the number of outputs. Received z outputs  and z outputs_from_all_shards.)r  )rO   rK   r#  r7  rb   r   r8  r   r   r   rl   r  Zndimsrc   r   r  concat)r   re   rU  rV  rW  rX  r   r#   r=   r   Zsplit_inputsZtransposed_inputsZ
compile_opr   Znum_outputsresultsr  Z
all_shardsr   r  Z	is_scalarr4   )rU  r5   split_compile_and_shard  sZ    I






r[  z	tpu.shardc
       
      C   s    t | |||||||||	d
d S )a  Shards `computation` for parallel execution.

  `inputs` must be a list of Tensors or None (equivalent to an empty list), each
  of which has a corresponding split axis (from `input_shard_axes`). Each input
  is split into `num_shards` pieces along the corresponding axis, and
  computation is applied to each shard in parallel.

  Tensors are broadcast to all shards if they are lexically captured by
  `computation`. e.g.,

  x = tf.constant(7)
  def computation():
    return x + 3
  ... = shard(computation, ...)

  TODO(phawkins): consider adding support for broadcasting Tensors passed
  as inputs.

  If `outputs_from_all_shards` is true, the outputs from all shards of
  `computation` are concatenated back together along their `output_shard_axes`.
  Otherwise, each output is taken from an arbitrary shard.

  Inputs and outputs of the computation must be at least rank-1 Tensors.

  Args:
    computation: A Python function that builds a computation to apply to each
      shard of the input.
    inputs: A list of input tensors or None (equivalent to an empty list). Each
      input tensor has a corresponding shard axes, given by `input_shard_axes`,
      which must have size divisible by `num_shards`.
    num_shards: The number of shards.
    input_shard_axes: A list of dimensions along which to shard `inputs`, or
      `None`. `None` means "shard all inputs along dimension 0". If not `None`,
      there must be one dimension per input.
    outputs_from_all_shards: Boolean or list of boolean. For each output, if
      `True`, outputs from all shards are concatenated along the corresponding
      `output_shard_axes` entry. Otherwise, each output is taken
      from an arbitrary shard. If the argument is a boolean, the argument's
      value is used for each output.
    output_shard_axes: A list of dimensions along which to concatenate the
      outputs of `computation`, or `None`. `None` means "concatenate all outputs
      along dimension 0". If not `None`, there must be one dimension per output.
      Ignored if `outputs_from_all_shards` is False.
    infeed_queue: If not `None`, the `InfeedQueue` to use to augment the inputs
      of `computation`.
    device_assignment: If not `None`, a `DeviceAssignment` describing the
      mapping between logical cores in the computation with physical cores in
      the TPU topology. Uses a default device assignment if `None`. The
      `DeviceAssignment` may be omitted if each shard of the computation uses
      only one core, and there is either only one shard, or the number of shards
      is equal to the number of cores in the TPU system.
    name: (Deprecated) Does nothing.
    xla_options: An instance of `tpu.XLAOptions` which indicates the options
      passed to XLA compiler. Use `None` for default options.
  Returns:
    A list of output tensors.
  Raises:
    ValueError: If num_shards <= 0
    ValueError: If len(input_shard_axes) != len(inputs)
    ValueError: If len(output_shard_axes) != len(outputs from `computation`)
  )	re   rU  rV  rW  rX  r   r#   r=   r   r;   )r[  )
r   re   rU  rV  rW  rX  r   r#   r=   r   r4   r4   r5   shard  s    Jr\  ztpu.batch_parallel)r   re   rU  r   r#   r=   r   c          	   C   s   t | ||||||dS )a  Shards `computation` along the batch dimension for parallel execution.

  Convenience wrapper around shard().

  `inputs` must be a list of Tensors or None (equivalent to an empty list).
  Each input is split into `num_shards` pieces along the 0-th dimension, and
  computation is applied to each shard in parallel.

  Tensors are broadcast to all shards if they are lexically captured by
  `computation`. e.g.,

  x = tf.constant(7)
  def computation():
    return x + 3
  ... = shard(computation, ...)

  The outputs from all shards are concatenated back together along their 0-th
  dimension.

  Inputs and outputs of the computation must be at least rank-1 Tensors.

  Args:
    computation: A Python function that builds a computation to apply to each
      shard of the input.
    inputs: A list of input tensors or None (equivalent to an empty list). The
      0-th dimension of each Tensor must have size divisible by `num_shards`.
    num_shards: The number of shards.
    infeed_queue: If not `None`, the `InfeedQueue` from which to append a tuple
      of arguments as inputs to `computation`.
    device_assignment: If not `None`, a `DeviceAssignment` describing the
      mapping between logical cores in the computation with physical cores in
      the TPU topology. Uses a default device assignment if `None`. The
      `DeviceAssignment` may be omitted if each shard of the computation uses
      only one core, and there is either only one shard, or the number of shards
      is equal to the number of cores in the TPU system.
    name: (Deprecated) Does nothing.
    xla_options: An instance of `tpu.XLAOptions` which indicates the options
      passed to XLA compiler. Use `None` for default options.
  Returns:
    A list of output tensors.
  Raises:
    ValueError: If `num_shards <= 0`
  )rU  r   r#   r=   r   )r\  )r   re   rU  r   r#   r=   r   r4   r4   r5   batch_parallel  s    5r]  ztpu.rewrite)r   re   r   r#   r=   r   r3   c             C   s&   t | |dkrdn|g||||dd S )a  Rewrites `computation` for execution on a TPU system.

  Args:
    computation: A Python function that builds a computation to apply to the
      input. If the function takes n inputs, 'inputs' should be a list of n
      tensors.

      `computation` may return a list of operations and tensors. Tensors must
      come before operations in the returned list.  The return value of
      `rewrite` is a list of tensors corresponding to the tensors from the
      output of `computation`.

      All `Operation`s constructed during `computation` will be executed when
      evaluating any of the returned output tensors, not just the ones returned.
    inputs: A list of input tensors or `None` (equivalent to an empty list).
      Each input can be a nested structure containing values that are
      convertible to tensors. Note that passing an N-dimension list of
      compatible values will result in a N-dimension list of scalar tensors
      rather than a single Rank-N tensors. If you need different behavior,
      convert part of inputs to tensors with `tf.convert_to_tensor`.
    infeed_queue: If not `None`, the `InfeedQueue` from which to append a tuple
      of arguments as inputs to `computation`.
    device_assignment: if not `None`, a `DeviceAssignment` describing the
      mapping between logical cores in the computation with physical cores in
      the TPU topology. May be omitted for a single-core computation, in which
      case the core attached to task 0, TPU device 0 is used.
    name: (Deprecated) Does nothing.
    xla_options: An instance of `tpu.XLAOptions` which indicates the options
      passed to XLA compiler. Use `None` for default options.
  Returns:
    Same data structure as if computation(*inputs) is called directly with some
    exceptions for correctness. Exceptions include:
      1) None output: a NoOp would be returned which control-depends on
         computation.
      2) Single value output: A tuple containing the value would be returned.
      3) Operation-only outputs: a NoOp would be returned which
         control-depends on computation.
      TODO(b/121383831): Investigate into removing these special cases.
  N)r   r#   r=   r   r   )r   )r   re   r   r#   r=   r   r4   r4   r5   rewrite,  s    2r^  ZReadVariableOpZAssignVariableOpZAssignAddVariableOpZAssignSubVariableOpr   Z
VariableV2c              C   sh   t  } xZ| rb|  }x|r2t|tr*dS |j}qW t| tjrH| j} q
t| t	j
r\| j} q
dS q
W dS )z6Check if it is currently under `_TPUInferenceContext`.TFN)r   rI   rJ   rK   _TPUInferenceContextrM   r   Z
_FuncGraphZ_outer_graphr   r   rH   )rP   r   r4   r4   r5   under_tpu_inference_contextu  s    

r`  c                   sV   e Zd ZdZdeed fddZdd Zdd	 Zd
d Z	dd Z
edd Z  ZS )r_  zA `ControlFlowContext` for nodes inside a TPU inference computation.

  The primary role of `_TPUInferenceContext` is to indicate the mode of
  operation and possibly sanity check operators inside a
  tpu.rewrite_for_inference() computation.
  T)r=   	check_opsc                s   t t|   || _|| _d S )N)rq   r_  rr   r|   
_check_ops)r   r=   ra  )rX   r4   r5   rr     s    z_TPUInferenceContext.__init__c             C   s   |  | d S )N)_AddOpInternal)r   r_   r4   r4   r5   r     s    z_TPUInferenceContext.AddOpc             C   s@   | j r*|jtkr*td|j d|j d| jr<| j| d S )NzOperation of type z (z) is not supported on the TPU for inference. Execution will fail if this op is used in the graph. Make sure your variables are using variable_scope.)rb  rb   _DENYLISTED_INFERENCE_OPSr   r=   r   r   )r   r_   r4   r4   r5   rc    s
    z#_TPUInferenceContext._AddOpInternalc             C   s   |}| j r| j |}|S )N)r   r   )r   r   r   r4   r4   r5   r     s    z_TPUInferenceContext.AddValuec             C   s   |  | d S )N)rc  )r   r_   r4   r4   r5   r     s    z_TPUInferenceContext.AddInnerOpc             C   s   d S )Nr4   )r   r4   r4   r5   r     s    z_TPUInferenceContext.grad_state)T)rS   r   r   r   r   rl   rr   r   rc  r   r   r   r   r   r4   r4   )rX   r5   r_    s   
r_  )rP   c             C   s"   t dd |  D stddS )a  Validates whether rewrite_for_inference() 'worked' for variables.

     The rewrite_for_inference() method is supposed to append GuaranteeConstOps
     after ReadVariableOps, but this mechanism works only if you are using
     tf.compat.v1.get_variable() to create and access variables in your tpu
     computation. This validation method can be called immediately after calling
     tpu.rewrite_for_inference() to check whether GuaranteeConstOps where added
     to the graph.

     Typical usages:
       tpu.validate_inference_rewrite_for_variables(
           tf.compat.v1.get_default_graph())

       tpu.validate_inference_rewrite_for_variables(sess.graph)

  Args:
    graph: The graph which needs to be validated.
  Raises:
    RuntimeError: if validation failed.
  c             s   s   | ]}|j d kV  qdS )ZGuaranteeConstN)rb   )r   r   r4   r4   r5   r     s    z;validate_inference_rewrite_for_variables.<locals>.<genexpr>zNo GuaranteeConst ops found in the graph after running tpu.rewrite_for_inference(...). Please check that you are using tf.get_variable() to create and access variables in your tpu computation.N)rY   get_operationsRuntimeError)rP   r4   r4   r5   (validate_inference_rewrite_for_variables  s    rg  )r   re   r   r#   r=   r3   c                s(   dd  fdd}t |||||dS )a  Rewrites `computation` for inference on a TPU system.

     Other than 'rewriting' the computation to run on a TPU, if using variables
     in your computation, it moves the ReadVariableOps outside the TPU
     computation, and adds GuaranteeConst ops just after the ReadVariableOps.
     This mechanism works only if you are using tf.compat.v1.get_variable() to
     create and access variables in your tpu computation. You can validate
     whether this worked, by calling validate_inference_rewrite_for_variables()
     method immediately after this method to check whether GuaranteeConstOps
     where added to the graph.

  Args:
    computation: A Python function that builds a computation to apply to the
      input. If the function takes n inputs, 'inputs' should be a list of n
      tensors. If the function returns m outputs, rewrite will return a list of
      m tensors.
    inputs: A list of input tensors or `None` (equivalent to an empty list).
    infeed_queue: If not `None`, the `InfeedQueue` from which to append a tuple
      of arguments as inputs to `computation`.
    device_assignment: if not `None`, a `DeviceAssignment` describing the
      mapping between logical cores in the computation with physical cores in
      the TPU topology. May be omitted for a single-core computation, in which
      case the core attached to task 0, TPU device 0 is used.
    name: The name of the operator.
  Returns:
    A list of output tensors.
  c          	   _   s4   t d   tj| |f|||d dS Q R X d S )Nz/GuaranteeConst)r=   )r   r?   r   Zguarantee_const)r,  r=   r   r   r4   r4   r5   guarantee_const_getter  s    z5rewrite_for_inference.<locals>.guarantee_const_getterc                 sz   t t dd}zV|  t }|j}|j}|	 |
dd   | |}|	| |
| W d|  X |S )z1Execute computation under `_TPUInferenceContext`.rewrite_for_inference)r=   c             S   s   | j S )N)r   )r_   r4   r4   r5   rV     rW   zDrewrite_for_inference.<locals>.wrapped_computation.<locals>.<lambda>N)r_  r   rI   r9  r   r!   r>  r.  Zcaching_devicer?  Zset_caching_devicer   )r   r   r   rK  Zprev_custom_getterZprev_caching_devicer   )r   rh  r4   r5   wrapped_computation  s    



z2rewrite_for_inference.<locals>.wrapped_computation)re   r   r#   r=   )r^  )r   re   r   r#   r=   rj  r4   )r   rh  r5   ri    s    "ri  )prune_graphc             C   s   x| gdd | j  D  D ]r}t|tjs.qx^| D ]R}|jtkrHq8d}x|jD ]}|	 rTd}P qTW |s8t
d|j|j |t q8W qW dS )a  Prunes unconnected ops as listed in _UNCONNECTED_OPS_TO_PRUNE.

  Args:
    prune_graph: A tensorflow graph from which we wish to prune unconnected ops
      as listed in _UNCONNECTED_OPS_TO_PRUNE.  In general, these ops should have
      no inputs and no consumers. These can often be left behind due to graph
      construction rewiring (for instance TF-Hub). While they never execute,
      they will cause XLA compile to fail so we strip them from XLA compile by
      removing the tpu_replicate attribute.
  c             S   s   g | ]}|qS r4   r4   )r   fr4   r4   r5   r   	  s    z2prune_unconnected_ops_from_xla.<locals>.<listcomp>FTzGPruning OP %s of type %s from XLA Compile due to it being disconnected.N)Z
_functionsvaluesrK   r   Graphre  rb   _UNCONNECTED_OPS_TO_PRUNEr   Z	consumersr
   r6  r=   Z_clear_attrr   )rk  rP   r_   Zoutputs_consumedoutputr4   r4   r5   prune_unconnected_ops_from_xla	  s"    
rq  )NNTN)N)N)NNNNNNN)N)NNNNTNNN)	Nr;   NTNNNNN)	Nr;   NTNNNNN)Nr;   NNNN)NNNNN)NNNN)r   collectionsenumrB  r   r   r   r   r   r   r   r	   Zabslr
   numpyr  Z!tensorflow.compiler.tf2xla.pythonr   r<  Ztensorflow.core.frameworkr   Ztensorflow.core.protobuf.tpur   r  r   Zembedding_pb2Ztensorflow.pythonr   Ztensorflow.python.compiler.xlaZtensorflow.python.distributer   r   Ztensorflow.python.frameworkr   r   r   r   r   r   r   r   r   r   r   r   r   Ztensorflow.python.opsr   r   r    r!   r"   rD  r#   Zdevice_assignment_libr$   r%   r&   Ztensorflow.python.tpu.opsr'   Ztensorflow.python.typesr(   r   Ztensorflow.python.utilr)   r*   r+   r,   r-   Z tensorflow.python.util.tf_exportr.   ZNotDifferentiablesetr   r   ro  r   r   r  rH  r   r;  r6   ZTPUEmbeddingConfigurationrl   r   rD   r   rE   rG   rQ   r\   ZDeviceAssignmentr^   Zregister_acd_resource_resolverZObjectIdentitySetrm   ZXLAControlFlowContextrL   r   r   r   IntEnumr   
namedtupler   Zfilter_tracebackZInfeedQueuer   r  r)  r  r  r!  r   r@  rA  r   r[  r\  r]  r^  rd  r`  r_  rn  rg  ri  rq  r4   r4   r4   r5   <module>   sn  (

   6
	    :

y
	
      Zc$ 
-       F   ,_,>        ` 
        pK
     N6
    N6(   BA