"""TPU embedding APIs."""

import collections
import copy
import math
import re
from typing import Optional

from tensorflow.core.protobuf.tpu import optimization_parameters_pb2
from tensorflow.core.protobuf.tpu import tpu_embedding_configuration_pb2 as elc
from tensorflow.python.eager import context
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import init_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import partitioned_variables
from tensorflow.python.ops import state_ops
from tensorflow.python.ops import variable_scope
from tensorflow.python.platform import tf_logging as logging
from tensorflow.python.tpu import tpu_system_metadata as tpu_system_metadata_lib
from tensorflow.python.tpu.ops import tpu_ops
from tensorflow.python.util.tf_export import tf_export

TRAINING = elc.TPUEmbeddingConfiguration.TRAINING
INFERENCE = elc.TPUEmbeddingConfiguration.INFERENCE


class TableConfig(
    collections.namedtuple('TableConfig', [
        'vocabulary_size', 'dimension', 'initializer', 'combiner',
        'hot_id_replication', 'learning_rate', 'learning_rate_fn',
        'optimization_parameters'
    ])):
  """Embedding table configuration."""

  def __new__(cls,
              vocabulary_size,
              dimension,
              initializer=None,
              combiner='mean',
              hot_id_replication=False,
              learning_rate=None,
              learning_rate_fn=None,
              optimization_parameters=None):
    """Embedding table configuration.

    Args:
      vocabulary_size: Number of vocabulary entries (rows) in the table.
      dimension: The embedding dimension.
      initializer: A variable initializer function to be used in embedding
        variable initialization. If not specified, defaults to
        `tf.compat.v1.truncated_normal_initializer` with mean `0.0` and standard
        deviation `1/sqrt(dimension)`.
      combiner: A string specifying how to reduce if there are multiple entries
        in a single row. Currently 'mean', 'sqrtn', 'sum' and None are
        supported, with 'mean' the default. 'sqrtn' often achieves good
        accuracy, in particular with bag-of-words columns. For more information,
        see `tf.nn.embedding_lookup_sparse`. None is only valid for dense rather
        than sparse tensors.
      hot_id_replication: If true, enables hot id replication, which can make
        embedding lookups faster if there are some hot rows in the table.
      learning_rate: float, static learning rate for this table. If
        learning_rate and learning_rate_fn are both `None`, static learning rate
        as specified in local `optimization_parameters` will be used. In case
        local `optimization_parameters` is `None`, global
        `optimization_parameters` in `TPUEmbedding` constructor will be used.
        `learning_rate_fn` must be `None` if `learning_rate` is not `None`.
      learning_rate_fn: A callable returning the dynamic learning rate to use.
        This function will be passed the current global step. If learning_rate
        and learning_rate_fn are both `None`, static learning rate as specified
        in `optimization_parameters` is used. `learning_rate` must be `None` if
        `learning_rate_fn` is not `None`.
      optimization_parameters: `AdagradParameters`, `AdamParameters`,
        `StochasticGradientDescentParameters`. Specifies a table-level
        optimizer. If it is `None`, the global optimizer passed to the
        `TPUEmbedding` constructor is used.

    Returns:
      `TableConfig`.

    Raises:
      ValueError: if `vocabulary_size` is not a positive integer.
      ValueError: if `dimension` is not a positive integer.
      ValueError: if `initializer` is specified and is not callable.
      ValueError: if `combiner` is not supported.
      ValueError: if `learning_rate` and `learning_rate_fn` are both not
        `None`.
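
    Example, a minimal sketch with illustrative values (the table below is
    hypothetical and not defined elsewhere in this module):

    ```
    video_table = TableConfig(
        vocabulary_size=100000,
        dimension=64,
        combiner='mean',
        learning_rate=0.05)
    ```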
    """
    if not isinstance(vocabulary_size, int) or vocabulary_size < 1:
      raise ValueError(
          f'vocabulary_size must be >= 1. Received: {vocabulary_size}.')

    if not isinstance(dimension, int) or dimension < 1:
      raise ValueError(
          f'dimension must be a positive int. Received: {dimension}.')

    if initializer is not None and not callable(initializer):
      raise ValueError(
          f'initializer must be callable if specified. Received: {initializer}.')
    if initializer is None:
      initializer = init_ops.truncated_normal_initializer(
          mean=0.0, stddev=1 / math.sqrt(dimension))

    if combiner not in ('mean', 'sum', 'sqrtn', None):
      raise ValueError(
          f'combiner must be "mean", "sum", "sqrtn" or None. '
          f'Received: {combiner}.')

    if learning_rate is not None and learning_rate_fn is not None:
      raise ValueError(
          f'At most one of learning_rate and learning_rate_fn can be set. '
          f'Received: {learning_rate} and {learning_rate_fn}.')

    if (optimization_parameters is not None and
        not isinstance(optimization_parameters, _OptimizationParameters)):
      raise ValueError(
          '`optimization_parameters` must inherit from '
          '`_OptimizationParameters`. Received: '
          f'`type(optimization_parameters)`={type(optimization_parameters)}.')

    return super().__new__(cls, vocabulary_size, dimension, initializer,
                           combiner, hot_id_replication, learning_rate,
                           learning_rate_fn, optimization_parameters)


class FeatureConfig(
    collections.namedtuple(
        'FeatureConfig', ['table_id', 'max_sequence_length', 'weight_key'])):
  """Feature configuration."""

  def __new__(cls, table_id, max_sequence_length=0, weight_key=None):
    """Feature configuration.

    Args:
      table_id: Which table the feature is used for embedding lookups.
      max_sequence_length: If positive, the feature is a sequence feature with
        the corresponding maximum sequence length. If the sequence is longer
        than this, it will be truncated. If 0, the feature is not a sequence
        feature.
      weight_key: If using weights for the combiner, this key specifies which
        input feature contains the weights.
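
    Example, a sketch pairing a feature with a hypothetical 'video' table and
    an optional weight feature (names are illustrative):

    ```
    watched_feature = FeatureConfig(
        table_id='video', max_sequence_length=0, weight_key='watch_weights')
    ```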

    Returns:
      `FeatureConfig`.

    Raises:
      ValueError: if `max_sequence_length` is not an integer or is negative.
    """
    if not isinstance(max_sequence_length, int) or max_sequence_length < 0:
      raise ValueError(
          f'max_sequence_length must be zero or a positive int, '
          f'got {max_sequence_length}.')
    return super().__new__(cls, table_id, max_sequence_length, weight_key)


class EnqueueData(
    collections.namedtuple(
        'EnqueueData',
        ['embedding_indices', 'sample_indices', 'aggregation_weights'])):
  """Data to be enqueued through generate_enqueue_ops()."""

  def __new__(cls,
              embedding_indices,
              sample_indices=None,
              aggregation_weights=None):
    """Data to be enqueued through generate_enqueue_ops().

    Args:
      embedding_indices: A rank 1 Tensor, indices into the embedding tables. It
        corresponds to sp_ids.values in embedding_lookup_sparse(). Both int32
        and int64 are allowed and will be converted to int32 internally.
      sample_indices: A rank 2 Tensor specifying the training example to which
        the corresponding embedding_indices and aggregation_weights values
        belong. It corresponds to sp_ids.indices in embedding_lookup_sparse().
        If it is None, we assume each embedding_indices belongs to a different
        sample. Both int32 and int64 are allowed and will be converted to int32
        internally.
      aggregation_weights: A rank 1 Tensor containing aggregation weights. It
        corresponds to sp_weights.values in embedding_lookup_sparse(). If it is
        None, we assume all weights are 1. Both float32 and float64 are allowed
        and will be converted to float32 internally.
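
    Example, a sketch of building `EnqueueData` from a `tf.SparseTensor`
    (tensor contents are illustrative):

    ```
    sp_ids = tf.SparseTensor(
        indices=[[0, 0], [1, 0], [1, 1]], values=[3, 7, 9], dense_shape=[2, 2])
    data = EnqueueData.from_sparse_tensor(sp_ids)
    # Equivalent to EnqueueData(sp_ids.values, sp_ids.indices).
    ```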

    Returns:
      An EnqueueData tuple.
    """
    return super().__new__(cls, embedding_indices, sample_indices,
                           aggregation_weights)

  @staticmethod
  def from_sparse_tensor(sp_tensor, weights=None):
    return EnqueueData(
        sp_tensor.values,
        sp_tensor.indices,
        aggregation_weights=weights.values if weights is not None else None)


class RaggedEnqueueData(
    collections.namedtuple(
        'RaggedEnqueueData',
        ['embedding_indices', 'row_splits', 'aggregation_weights'])):
  """RaggedTensor data to be enqueued through generate_enqueue_ops()."""

  def __new__(cls,
              embedding_indices,
              row_splits=None,
              aggregation_weights=None):
    """Data to be enqueued through generate_enqueue_ops().

    Args:
      embedding_indices: A rank 1 Tensor, indices into the embedding tables. It
        corresponds to ids.values in embedding_lookup(), when ids is a
        RaggedTensor. Both int32 and int64 are allowed and will be converted to
        int32 internally.
      row_splits: A rank 1 Tensor specifying the break points for
        splitting embedding_indices and aggregation_weights. It corresponds to
        ids.row_splits in embedding_lookup(), when ids is a RaggedTensor. Both
        int32 and int64 are allowed and will be converted to int32 internally.
      aggregation_weights: A rank 1 Tensor containing per training example
        aggregation weights. It corresponds to the values field of a
        RaggedTensor with the same row_splits as ids in embedding_lookup(), when
        ids is a RaggedTensor.

    Returns:
      A `RaggedEnqueueData` tuple.
    """
    return super().__new__(cls, embedding_indices, row_splits,
                           aggregation_weights)

  @staticmethod
  def from_ragged_tensor(rg_tensor, weights=None):
    return RaggedEnqueueData(
        rg_tensor.values,
        rg_tensor.row_splits,
        aggregation_weights=weights.values if weights is not None else None)


def get_enqueue_datas_list_from_sparse_tensors_list(sp_tensors_list):
  """Convenient function for generate_enqueue_ops().

  Args:
    sp_tensors_list: a list of dictionary mapping from string of feature names
      to SparseTensor. Each dictionary is for one TPU core. Dictionaries for the
      same host should be contiguous on the list.

  Returns:
    enqueue_datas_list: a list of dictionary mapping from string
      of feature names to EnqueueData. Each dictionary is for one
      TPU core. Dictionaries for the same host should be contiguous
      on the list.
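
  Example, a sketch with one host and two TPU cores; the feature names and
  SparseTensors are illustrative:

  ```
  enqueue_datas_list = get_enqueue_datas_list_from_sparse_tensors_list([
      {'watched': sp_watched_0, 'friends': sp_friends_0},  # core 0
      {'watched': sp_watched_1, 'friends': sp_friends_1},  # core 1
  ])
  ```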

  """
  enqueue_datas_list = []
  for sp_tensors in sp_tensors_list:
    enqueue_datas = collections.OrderedDict(
        (k, EnqueueData.from_sparse_tensor(v)) for k, v in sp_tensors.items())
    enqueue_datas_list.append(enqueue_datas)
  return enqueue_datas_list


def get_enqueue_datas_list_from_ragged_tensors_list(rg_tensors_list):
  """Convenient function for generate_enqueue_ops().

  Args:
    rg_tensors_list: a list of dictionary mapping from string of feature names
      to RaggedTensor. Each dictionary is for one TPU core. Dictionaries for the
      same host should be contiguous on the list.

  Returns:
    enqueue_datas_list: a list of dictionary mapping from string
      of feature names to RaggedEnqueueData. Each dictionary is for one
      TPU core. Dictionaries for the same host should be contiguous
      on the list.
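
  Example, a sketch mirroring the sparse variant, with one dictionary of
  RaggedTensors per TPU core (names are illustrative):

  ```
  enqueue_datas_list = get_enqueue_datas_list_from_ragged_tensors_list(
      [ragged_features_core0, ragged_features_core1])
  ```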

  """
  enqueue_datas_list = []
  for rg_tensors in rg_tensors_list:
    enqueue_datas = collections.OrderedDict(
        (k, RaggedEnqueueData.from_ragged_tensor(v))
        for k, v in rg_tensors.items())
    enqueue_datas_list.append(enqueue_datas)
  return enqueue_datas_list


AdamSlotVariableNames = collections.namedtuple('AdamSlotVariableNames', ['m', 'v'])
AdagradSlotVariableNames = collections.namedtuple('AdagradSlotVariableNames', ['accumulator'])
MomentumSlotVariableNames = collections.namedtuple('MomentumSlotVariableNames', ['momenta'])
AdagradMomentumSlotVariableNames = collections.namedtuple('AdagradMomentumSlotVariableNames', ['accumulator', 'momenta'])
RMSPropSlotVariableNames = collections.namedtuple('RMSPropSlotVariableNames', ['ms', 'mom'])
ProximalAdagradSlotVariableNames = collections.namedtuple('ProximalAdagradSlotVariableNames', ['accumulator'])
FtrlSlotVariableNames = collections.namedtuple('FtrlSlotVariableNames', ['accumulator', 'linear'])
ProximalYogiSlotVariableNames = collections.namedtuple('ProximalYogiSlotVariableNames', ['v', 'm'])
FrequencyEstimatorSlotVariableNames = collections.namedtuple('FrequencyEstimatorSlotVariableNames', ['last_hit_step'])
AdamSlotVariables = collections.namedtuple('AdamSlotVariables', ['m', 'v'])
MomentumSlotVariables = collections.namedtuple('MomentumSlotVariables', ['momenta'])
AdagradMomentumSlotVariables = collections.namedtuple('AdagradMomentumSlotVariables', ['accumulator', 'momenta'])
RMSPropSlotVariables = collections.namedtuple('RMSPropSlotVariables', ['ms', 'mom'])
AdagradSlotVariables = collections.namedtuple('AdagradSlotVariables', ['accumulator'])
ProximalAdagradSlotVariables = collections.namedtuple('ProximalAdagradSlotVariables', ['accumulator'])
FtrlSlotVariable = collections.namedtuple('FtrlSlotVariable', ['accumulator', 'linear'])
ProximalYogiSlotVariables = collections.namedtuple('ProximalYogiSlotVariables', ['v', 'm'])
FrequencyEstimatorSlotVariables = collections.namedtuple('FrequencyEstimatorSlotVariables', ['last_hit_step'])

VariablesAndOps = collections.namedtuple('VariablesAndOps', [
    'embedding_variables_by_table', 'slot_variables_by_table', 'load_ops',
    'retrieve_ops'
])


class _OptimizationParameters:
  """Parameters common to all optimizations."""

  def __init__(
      self,
      learning_rate: float,
      use_gradient_accumulation: bool,
      clip_weight_min: Optional[float],
      clip_weight_max: Optional[float],
      weight_decay_factor: Optional[float],
      multiply_weight_decay_factor_by_learning_rate: Optional[bool],
      clip_gradient_min: Optional[float] = None,
      clip_gradient_max: Optional[float] = None,
  ):
    self.learning_rate = learning_rate
    self.use_gradient_accumulation = use_gradient_accumulation
    self.clip_weight_min = clip_weight_min
    self.clip_weight_max = clip_weight_max
    self.weight_decay_factor = weight_decay_factor
    self.multiply_weight_decay_factor_by_learning_rate = (
        multiply_weight_decay_factor_by_learning_rate)
    self.clip_gradient_min = clip_gradient_min
    self.clip_gradient_max = clip_gradient_max

    if not use_gradient_accumulation and (clip_gradient_min is not None or
                                          clip_gradient_max is not None):
      raise ValueError('When using gradient clipping limits, gradient '
                       'accumulation must be enabled.')


@tf_export(v1=['tpu.experimental.AdagradParameters'])
class AdagradParameters(_OptimizationParameters):
  """Optimization parameters for Adagrad with TPU embeddings.

  Pass this to `tf.estimator.tpu.experimental.EmbeddingConfigSpec` via the
  `optimization_parameters` argument to set the optimizer and its parameters.
  See the documentation for `tf.estimator.tpu.experimental.EmbeddingConfigSpec`
  for more details.

  ```
  estimator = tf.estimator.tpu.TPUEstimator(
      ...
      embedding_spec=tf.estimator.tpu.experimental.EmbeddingConfigSpec(
          ...
          optimization_parameters=tf.tpu.experimental.AdagradParameters(0.1),
          ...))
  ```

  """

  def __init__(
      self,
      learning_rate: float,
      initial_accumulator: float = 0.1,
      use_gradient_accumulation: bool = True,
      clip_weight_min: Optional[float] = None,
      clip_weight_max: Optional[float] = None,
      weight_decay_factor: Optional[float] = None,
      multiply_weight_decay_factor_by_learning_rate: Optional[bool] = None,
      clip_gradient_min: Optional[float] = None,
      clip_gradient_max: Optional[float] = None,
  ):
    """Optimization parameters for Adagrad.

    Args:
      learning_rate: used for updating embedding table.
      initial_accumulator: initial accumulator for Adagrad.
      use_gradient_accumulation: setting this to `False` makes embedding
        gradients calculation less accurate but faster. Please see
        `optimization_parameters.proto` for details.
      clip_weight_min: the minimum value to clip by; None means -infinity.
      clip_weight_max: the maximum value to clip by; None means +infinity.
      weight_decay_factor: amount of weight decay to apply; None means that the
        weights are not decayed.
      multiply_weight_decay_factor_by_learning_rate: if true,
        `weight_decay_factor` is multiplied by the current learning rate.
      clip_gradient_min: the minimum value to clip by; None means -infinity.
        Gradient accumulation must be set to true if this is set.
      clip_gradient_max: the maximum value to clip by; None means +infinity.
        Gradient accumulation must be set to true if this is set.
    """
    super().__init__(
        learning_rate=learning_rate,
        use_gradient_accumulation=use_gradient_accumulation,
        clip_weight_min=clip_weight_min,
        clip_weight_max=clip_weight_max,
        weight_decay_factor=weight_decay_factor,
        multiply_weight_decay_factor_by_learning_rate=(
            multiply_weight_decay_factor_by_learning_rate),
        clip_gradient_min=clip_gradient_min,
        clip_gradient_max=clip_gradient_max)
    if initial_accumulator <= 0:
      raise ValueError('Adagrad initial_accumulator must be greater than '
                       f'zero. Received: {initial_accumulator}.')
    self.initial_accumulator = initial_accumulator


class AdagradMomentumParameters(_OptimizationParameters):
  """Optimization parameters for Adagrad + Momentum with TPU embeddings.

  Pass this to `tf.estimator.tpu.experimental.EmbeddingConfigSpec` via the
  `optimization_parameters` argument to set the optimizer and its parameters.
  See the documentation for `tf.estimator.tpu.experimental.EmbeddingConfigSpec`
  for more details.

  ```
  estimator = tf.estimator.tpu.TPUEstimator(
      ...
      embedding_spec=tf.estimator.tpu.experimental.EmbeddingConfigSpec(
          ...
          optimization_parameters=tf.tpu.experimental.AdagradMomentumParameters(0.1),
          ...))
  ```

  F   r   绽|=TNr*   momentumuse_nesterovexponentbeta2epsilonr}   r~   r   r   r   r   r   c              
      s^   t  j||||	|
|||d |dkrtd|dkrtd|| _|| _|| _|| _|| _dS )a?  Optimization parameters for Adagrad.

    Args:
      learning_rate: used for updating embedding table.
      momentum: Moving average parameter for the momentum accumulator.
      use_nesterov: Whether to use the Nesterov variant of momentum. See
        Sutskever et al., 2013.
      exponent: Exponent for the Adagrad accumulator.
      beta2: Moving average parameter for the Adagrad accumulator.
      epsilon: initial accumulator for Adagrad accumulator.
      use_gradient_accumulation: setting this to `False` makes embedding
        gradients calculation less accurate but faster. Please see
        `optimization_parameters.proto` for details.
      clip_weight_min: the minimum value to clip by; None means -infinity.
      clip_weight_max: the maximum value to clip by; None means +infinity.
      weight_decay_factor: amount of weight decay to apply; None means that the
        weights are not decayed.
      multiply_weight_decay_factor_by_learning_rate: if true,
        `weight_decay_factor` is multiplied by the current learning rate.
      clip_gradient_min: the minimum value to clip by; None means -infinity.
        Gradient accumulation must be set to true if this is set.
      clip_gradient_max: the maximum value to clip by; None means +infinity.
        Gradient accumulation must be set to true if this is set.
    r   r   z*Adagrad momentum: epsilon must be positivez/Adagrad momentum: Precondition exponent must >0N)r"   r   r   r   r   r   r   r   )r   r*   r   r   r   r   r   r}   r~   r   r   r   r   r   r-   r/   r0   r     s&   (
z"AdagradMomentumParameters.__init__)Fr   r   r   TNNNNNNr   r/   r/   r-   r0   r     sP    	
r   c                       s|   e Zd ZdZ										ddededed	ed
edee dee dee dee dee dee f fddZ  ZS )ProximalAdagradParametersaA  Optimization parameters for ProximalAdagrad with TPU embeddings.

  Pass this to `tf.estimator.tpu.experimental.EmbeddingConfigSpec` via the
  `optimization_parameters` argument to set the optimizer and its parameters.
  See the documentation for `tf.estimator.tpu.experimental.EmbeddingConfigSpec`
  for more details.
  r   r   TNr*   r   l1_regularization_strengthl2_regularization_strengthr}   r~   r   r   r   r   r   c              
      sv   t  j||||||	|
|d |dkrtd| d|dk r%td||dk r0td||| _|| _|| _dS )	a  Optimization parameters for Adagrad.

    Args:
      learning_rate: used for updating embedding table.
      initial_accumulator: initial accumulator for Adagrad.
      l1_regularization_strength: A float value, must be greater than or equal
        to zero.
      l2_regularization_strength: A float value, must be greater than or equal
        to zero.
      use_gradient_accumulation: setting this to `False` makes embedding
        gradients calculation less accurate but faster. Please see
        `optimization_parameters.proto` for details.
      clip_weight_min: the minimum value to clip by; None means -infinity.
      clip_weight_max: the maximum value to clip by; None means +infinity.
      weight_decay_factor: amount of weight decay to apply; None means that the
        weights are not decayed.
      multiply_weight_decay_factor_by_learning_rate: if true,
        `weight_decay_factor` is multiplied by the current learning rate.
      clip_gradient_min: the minimum value to clip by; None means -infinity.
        Gradient accumulation must be set to true if this is set.
      clip_gradient_max: the maximum value to clip by; None means +infinity.
        Gradient accumulation must be set to true if this is set.
    r   r   z8Adagrad initial_accumulator must be positive. Received: r   r   Fl1_regularization_strength must be greater than or equal to 0. got {}.Fl2_regularization_strength must be greater than or equal to 0. got {}.N)r"   r   r   r   r   r   r   )r   r*   r   r   r   r}   r~   r   r   r   r   r   r-   r/   r0   r   +  s2   %

z"ProximalAdagradParameters.__init__)
r   r   r   TNNNNNNr   r/   r/   r-   r0   r   "  sF    	
r   ztpu.experimental.AdamParametersc                       s   e Zd ZdZ												ddeded	ed
ededededee dee dee dee dee dee f fddZ  ZS )AdamParametersa3  Optimization parameters for Adam with TPU embeddings.

  Pass this to `tf.estimator.tpu.experimental.EmbeddingConfigSpec` via the
  `optimization_parameters` argument to set the optimizer and its parameters.
  See the documentation for `tf.estimator.tpu.experimental.EmbeddingConfigSpec`
  for more details.

  ```
  estimator = tf.estimator.tpu.TPUEstimator(
      ...
      embedding_config_spec=tf.estimator.tpu.experimental.EmbeddingConfigSpec(
          ...
          optimization_parameters=tf.tpu.experimental.AdamParameters(0.1),
          ...))
  ```

  ?+?:0yE>TNr*   beta1r   r   	lazy_adamsum_inside_sqrtr}   r~   r   r   r   r   r   c              
      s   t  j||||	|
|||d |dk s|dkrtd||dk s%|dkr,td||dkr7td||s?|s?td|| _|| _|| _|| _|| _dS )	a  Optimization parameters for Adam.

    Args:
      learning_rate: a floating point value. The learning rate.
      beta1: A float value. The exponential decay rate for the 1st moment
        estimates.
      beta2: A float value. The exponential decay rate for the 2nd moment
        estimates.
      epsilon: A small constant for numerical stability.
      lazy_adam: Use lazy Adam instead of Adam. Lazy Adam trains faster. See
        `optimization_parameters.proto` for details.
      sum_inside_sqrt: This improves training speed. Please see
        `optimization_parameters.proto` for details.
      use_gradient_accumulation: setting this to `False` makes embedding
        gradients calculation less accurate but faster. Please see
        `optimization_parameters.proto` for details.
      clip_weight_min: the minimum value to clip by; None means -infinity.
      clip_weight_max: the maximum value to clip by; None means +infinity.
      weight_decay_factor: amount of weight decay to apply; None means that the
        weights are not decayed.
      multiply_weight_decay_factor_by_learning_rate: if true,
        `weight_decay_factor` is multiplied by the current learning rate.
      clip_gradient_min: the minimum value to clip by; None means -infinity.
        Gradient accumulation must be set to true if this is set.
      clip_gradient_max: the maximum value to clip by; None means +infinity.
        Gradient accumulation must be set to true if this is set.
    r   r         ?'beta1 must be between 0. and 1; got {}.'beta2 must be between 0. and 1; got {}.!epsilon must be positive; got {}.z=When disabling Lazy Adam, gradient accumulation must be used.N)	r"   r   r   r   r   r   r   r   r   )r   r*   r   r   r   r   r   r}   r~   r   r   r   r   r   r-   r/   r0   r     s2   +
zAdamParameters.__init__)r   r   r   TTTNNNNNNr   r/   r/   r-   r0   r   k  sR    	
r   ztpu.experimental.FtrlParametersc                        s   e Zd ZdZ														dd	ed
edededededee dee dee dee dedededee dee f fddZ  ZS )FtrlParametersa3  Optimization parameters for Ftrl with TPU embeddings.

  Pass this to `tf.estimator.tpu.experimental.EmbeddingConfigSpec` via the
  `optimization_parameters` argument to set the optimizer and its parameters.
  See the documentation for `tf.estimator.tpu.experimental.EmbeddingConfigSpec`
  for more details.

  ```
  estimator = tf.estimator.tpu.TPUEstimator(
      ...
      embedding_config_spec=tf.estimator.tpu.experimental.EmbeddingConfigSpec(
          ...
          optimization_parameters=tf.tpu.experimental.FtrlParameters(0.1),
          ...))
  ```

        r   r   TNFr   r*   learning_rate_powerinitial_accumulator_valuer   r   r}   r~   r   r   r    multiply_linear_by_learning_ratebetaallow_zero_accumulatorr   r   c              
      s   t  j|||||	|
||d |dkrtd||dk r$td||dk r/td||dk r:td||| _|| _d| _|| _|| _|| _	|| _
|| _dS )a  Optimization parameters for Ftrl.

    Implements FTRL as described in the following [paper](
    https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/41159.pdf)

    Args:
      learning_rate: a floating point value. The learning rate.
      learning_rate_power: A float value, must be less or equal to zero.
        Controls how the learning rate decreases during training. Use zero for a
        fixed learning rate. See section 3.1 in the
        [paper](https://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf).
      initial_accumulator_value: The starting value for accumulators. Only zero
        or positive values are allowed.
      l1_regularization_strength: A float value, must be greater than or equal
        to zero.
      l2_regularization_strength: A float value, must be greater than or equal
        to zero.
      use_gradient_accumulation: setting this to `False` makes embedding
        gradients calculation less accurate but faster. Please see
        `optimization_parameters.proto` for details.
      clip_weight_min: the minimum value to clip by; None means -infinity.
      clip_weight_max: the maximum value to clip by; None means +infinity.
      weight_decay_factor: amount of weight decay to apply; None means that the
        weights are not decayed.
      multiply_weight_decay_factor_by_learning_rate: if true,
        `weight_decay_factor` is multiplied by the current learning rate.
      multiply_linear_by_learning_rate: When true, multiplies the usages of the
        linear slot in the weight update by the learning rate. This is useful
        when ramping up learning rate from 0 (which would normally produce
        NaNs).
      beta: The beta parameter for FTRL.
      allow_zero_accumulator: Changes the implementation of the square root to
        allow for the case of initial_accumulator_value being zero. This will
        cause a slight performance drop.
      clip_gradient_min: the minimum value to clip by; None means -infinity.
        Gradient accumulation must be set to true if this is set.
      clip_gradient_max: the maximum value to clip by; None means +infinity.
        Gradient accumulation must be set to true if this is set.
    r   r   z<learning_rate_power must be less than or equal to 0. got {}.zEinitial_accumulator_value must be greater than or equal to 0. got {}.r   r   N)r"   r   r   r   r   r   initial_linear_valuer   r   r   r   r   )r   r*   r   r   r   r   r}   r~   r   r   r   r   r   r   r   r   r-   r/   r0   r     sD   9
zFtrlParameters.__init__)r   r   r   r   TNNNNFr   FNNr   r/   r/   r-   r0   r     s^    	
r   c                       s   e Zd ZdZ																				dd
ededededededededee dee dee dee dee dee f fddZ  ZS )ProximalYogiParametersa  Optimization parameters for Proximal Yogi with TPU embeddings.

  Implements the Yogi optimizer as described in
  [Adaptive Methods for Nonconvex
  Optimization](https://papers.nips.cc/paper/8186-adaptive-methods-for-nonconvex-optimization).

  Pass this to `tf.estimator.tpu.experimental.EmbeddingConfigSpec` via the
  `optimization_parameters` argument to set the optimizer and its parameters.
  See the documentation for `tf.estimator.tpu.experimental.EmbeddingConfigSpec`
  for more details.
  {Gz?r   r   MbP?r   ư>TNr*   r   r   r   r   r   r   r}   r~   r   r   r   r   r   c              
      s   t  j|||	|
||||d |dk s|dkrtd||dk s%|dkr,td||dkr7td||dk rBtd||dk rMtd||| _|| _|| _|| _|| _|| _	d	S )
a   Optimization parameters for Proximal Yogi.

    Args:
      learning_rate: a floating point value. The learning rate.
      beta1: A float value. The exponential decay rate for the 1st moment
        estimates.
      beta2: A float value. The exponential decay rate for the 2nd moment
        estimates.
      epsilon: A small constant for numerical stability.
      l1_regularization_strength: A float value, must be greater than or equal
        to zero.
      l2_regularization_strength: A float value, must be greater than or equal
        to zero.
      initial_accumulator_value: The starting value for accumulators. Only zero
        or positive values are allowed.
      use_gradient_accumulation: setting this to `False` makes embedding
        gradients calculation less accurate but faster. Please see
        `optimization_parameters.proto` for details.
      clip_weight_min: the minimum value to clip by; None means -infinity.
      clip_weight_max: the maximum value to clip by; None means +infinity.
      weight_decay_factor: amount of weight decay to apply; None means that the
        weights are not decayed.
      multiply_weight_decay_factor_by_learning_rate: if true,
        `weight_decay_factor` is multiplied by the current learning rate.
      clip_gradient_min: the minimum value to clip by; None means -infinity.
        Gradient accumulation must be set to true if this is set.
      clip_gradient_max: the maximum value to clip by; None means +infinity.
        Gradient accumulation must be set to true if this is set.
    r   r   r   r   r   r   r   r   N)
r"   r   r   r   r   r   r   r   r   r   )r   r*   r   r   r   r   r   r   r}   r~   r   r   r   r   r   r-   r/   r0   r   H  s<   .
zProximalYogiParameters.__init__)r   r   r   r   r   r   r   TNNNNNNr   r/   r/   r-   r0   r   8  sZ    	
r   c                       st   e Zd ZdZ								ddedededed	ee d
ee dee dee dee dee f fddZ  ZS )MomentumParametersa4  Optimization parameters for Momentum with TPU embeddings.

  Pass this to `tf.estimator.tpu.experimental.EmbeddingConfigSpec` via the
  `optimization_parameters` argument to set the optimizer and its parameters.
  See the documentation for `tf.estimator.tpu.experimental.EmbeddingConfigSpec`
  for more details.

  ```
  estimator = tf.estimator.tpu.TPUEstimator(
      ...
      embedding_spec=tf.estimator.tpu.experimental.EmbeddingConfigSpec(
          ...
          optimization_parameters=tf.tpu.experimental.MomentumParameters(0.1),
          ...))
  ```

  FTNr*   r   r   r}   r~   r   r   r   r   r   c              
      s,   t  j|||||||	|
d || _|| _dS )a  Optimization parameters for momentum.

    Args:
      learning_rate: a floating point value. The learning rate.
      momentum: a floating point value.  The momentum.
      use_nesterov: If `True` use Nesterov Momentum. See (Sutskever et al.,
        2013). This implementation always computes gradients at the value of the
        variable(s) passed to the optimizer. Using Nesterov Momentum makes the
        variable(s) track the values called `theta_t + mu*v_t` in the paper.
        This implementation is an approximation of the original formula, valid
        for high values of momentum. It will compute the "adjusted gradient" in
        NAG by assuming that the new gradient will be estimated by the current
        average gradient plus the product of momentum and the change in the
        average gradient.
      use_gradient_accumulation: setting this to `False` makes embedding
        gradients calculation less accurate but faster. Please see
        `optimization_parameters.proto` for details.
      clip_weight_min: the minimum value to clip by; None means -infinity.
      clip_weight_max: the maximum value to clip by; None means +infinity.
      weight_decay_factor: amount of weight decay to apply; None means that the
        weights are not decayed.
      multiply_weight_decay_factor_by_learning_rate: if true,
        `weight_decay_factor` is multiplied by the current learning rate.
      clip_gradient_min: the minimum value to clip by; None means -infinity.
        Gradient accumulation must be set to true if this is set.
      clip_gradient_max: the maximum value to clip by; None means +infinity.
        Gradient accumulation must be set to true if this is set.
    r   N)r"   r   r   r   )r   r*   r   r   r}   r~   r   r   r   r   r   r-   r/   r0   r     s   )
zMomentumParameters.__init__)FTNNNNNNr   r/   r/   r-   r0   r     s>    	
r   c                       sv   e Zd ZdZ							ddededededed	ee d
ee dee dee dee dee f fddZ  ZS )RMSPropParametersa3  Optimization parameters for RMSProp with TPU embeddings.

  Pass this to `tf.estimator.tpu.experimental.EmbeddingConfigSpec` via the
  `optimization_parameters` argument to set the optimizer and its parameters.
  See the documentation for `tf.estimator.tpu.experimental.EmbeddingConfigSpec`
  for more details.

  ```
  estimator = tf.estimator.tpu.TPUEstimator(
      ...
      embedding_spec=tf.estimator.tpu.experimental.EmbeddingConfigSpec(
          ...
          optimization_parameters=tf.tpu.experimental.MomentumParameters(0.1),
          ...))
  ```

  TNr*   rhor   r   r}   r~   r   r   r   r   r   c              
      s2   t  j||||||	|
|d || _|| _|| _dS )a  Optimization parameters for RMS prop.

    Args:
      learning_rate: a floating point value. The learning rate.
      rho: Discounting factor for the history/coming gradient
      momentum: A scalar tensor.
      epsilon: Small value to avoid zero denominator.
      use_gradient_accumulation: setting this to `False` makes embedding
        gradients calculation less accurate but faster. Please see
        `optimization_parameters.proto` for details.
      clip_weight_min: the minimum value to clip by; None means -infinity.
      clip_weight_max: the maximum value to clip by; None means +infinity.
      weight_decay_factor: amount of weight decay to apply; None means that the
        weights are not decayed.
      multiply_weight_decay_factor_by_learning_rate: if true,
        `weight_decay_factor` is multiplied by the current learning rate.
      clip_gradient_min: the minimum value to clip by; None means -infinity.
        Gradient accumulation must be set to true if this is set.
      clip_gradient_max: the maximum value to clip by; None means +infinity.
        Gradient accumulation must be set to true if this is set.
    r   N)r"   r   r   r   r   )r   r*   r   r   r   r}   r~   r   r   r   r   r   r-   r/   r0   r     s   #
zRMSPropParameters.__init__TNNNNNNr   r/   r/   r-   r0   r     s@    	
r   z4tpu.experimental.StochasticGradientDescentParametersc                       sj   e Zd ZdZ							ddededee dee dee d	ee d
ee dee f fddZ  ZS )#StochasticGradientDescentParametersa`  Optimization parameters for stochastic gradient descent for TPU embeddings.

  Pass this to `tf.estimator.tpu.experimental.EmbeddingConfigSpec` via the
  `optimization_parameters` argument to set the optimizer and its parameters.
  See the documentation for `tf.estimator.tpu.experimental.EmbeddingConfigSpec`
  for more details.

  ```
  estimator = tf.estimator.tpu.TPUEstimator(
      ...
      embedding_config_spec=tf.estimator.tpu.experimental.EmbeddingConfigSpec(
          ...
          optimization_parameters=(
              tf.tpu.experimental.StochasticGradientDescentParameters(0.1))))
  ```

  TNr*   r}   r~   r   r   r   r   r   c	           	   
      s    t  j||||||||d dS )ar  Optimization parameters for stochastic gradient descent.

    Args:
      learning_rate: a floating point value. The learning rate.
      use_gradient_accumulation: setting this to `False` makes embedding
        gradients calculation less accurate but faster. Please see
        `optimization_parameters.proto` for details.
      clip_weight_min: the minimum value to clip by; None means -infinity.
      clip_weight_max: the maximum value to clip by; None means +infinity.
      weight_decay_factor: amount of weight decay to apply; None means that the
        weights are not decayed.
      multiply_weight_decay_factor_by_learning_rate: if true,
        `weight_decay_factor` is multiplied by the current learning rate.
      clip_gradient_min: the minimum value to clip by; None means -infinity.
      clip_gradient_max: the maximum value to clip by; None means +infinity.
    r   N)r"   r   r   r-   r/   r0   r   ;  s   
z,StochasticGradientDescentParameters.__init__r   r   r/   r/   r-   r0   r   '  s4    	r   c                       s2   e Zd ZdZdedededef fddZ  ZS )FrequencyEstimatorParametersay  Optimization parameters for Frequency Estimator TPU embeddings.

  This is a non-standard optimizer, which returns the estimated frequency of
  lookup for the feature passed to it. It should only be used on a table of
  width 1. The gradient fed back to the TPU embedding should always be zero.
  This can be accomplished by using `tf.stop_gradient` on the feature before
  using it.

  You must use the dynamic learning rate mechanism to set the 'learning rate'
  for this table to be a float32 cast of the global training step counter.

  See `tensorflow/core/protobuf/tpu/optimization_parameters.proto` for more
  details on this optimizer.

  Pass this to `tf.estimator.tpu.experimental.EmbeddingConfigSpec` via the
  `optimization_parameters` argument to set the optimizer and its parameters.
  See the documentation for `tf.estimator.tpu.experimental.EmbeddingConfigSpec`
  for more details.

  ```
  estimator = tf.estimator.tpu.TPUEstimator(
      ...
      embedding_spec=tf.estimator.tpu.experimental.EmbeddingConfigSpec(
          ...
          optimization_parameters=FrequencyEstimatorParameters(0.1),
          ...))
  ```

  tau	max_deltaoutlier_thresholdweight_exponentc                    s4   t  jddddddd || _|| _|| _|| _dS )a  Optimization parameters for frequency estimator.

    Args:
      tau: Learning rate between (0, 1) that is used to update the array.
      max_delta: Maximum value of delta, the difference between the current
        global step and the last global step at which the row was sampled.
      outlier_threshold: Threshold used to determine whether the current update
        is an outlier.
      weight_exponent: The weight exponent used to transform the estimated delta
        into weights.
    r   TN)r*   r}   r~   r   r   r   )r"   r   r   r   r   r   )r   r   r   r   r   r-   r/   r0   r     s   
z%FrequencyEstimatorParameters.__init__)r2   r3   r4   r5   r   r   r6   r/   r/   r-   r0   r   c  s
    r   DeviceConfig)	num_hosts	num_coresjob_namec                   @   s   e Zd ZdZ								d+ddZedd Zed	d
 Zedd Zedd Z	edd Z
edd Zedd Zedd Zedd Zdd Z		d,ddZ		d-ddZdd  Z		d-d!d"Zd#d$ Zd%d& Zd.d'd(Zd)d* ZdS )/TPUEmbeddinga  API for using TPU for embedding.

    Example:
    ```
    table_config_video = tpu_embedding.TableConfig(
        vocabulary_size=4, dimension=2,
        initializer=initializer, combiner='mean')
    table_config_user = tpu_embedding.TableConfig(
        vocabulary_size=4, dimension=2,
        initializer=initializer, combiner='mean')
    table_to_config_dict = {'video': table_config_video,
                            'user': table_config_user}
    feature_to_config_dict = {'watched': tpu_embedding.FeatureConfig('video'),
                              'favorited': tpu_embedding.FeatureConfig('video'),
                              'friends': tpu_embedding.FeatureConfig('user')}
    batch_size = 4
    num_hosts = 1
    optimization_parameters = tpu_embedding.AdagradParameters(1., 1.)
    mode = tpu_embedding.TRAINING
    embedding = tpu_embedding.TPUEmbedding(
        table_to_config_dict, feature_to_config_dict,
        batch_size, num_hosts, mode, optimization_parameters)

    batch_size_per_core = embedding.batch_size_per_core
    sparse_features_list = []
    for host in hosts:
      with ops.device(host):
        for _ in range(embedding.num_cores_per_host):
          sparse_features = {}
          sparse_features['watched'] = sparse_tensor.SparseTensor(...)
          sparse_features['favorited'] = sparse_tensor.SparseTensor(...)
          sparse_features['friends'] = sparse_tensor.SparseTensor(...)
          sparse_features_list.append(sparse_features)

    enqueue_ops = embedding.generate_enqueue_ops(sparse_features_list)
    embedding_variables_and_ops = embedding.create_variables_and_ops()

    def computation():
      activations = embedding.get_activations()
      loss = compute_loss(activations)

      base_optimizer = gradient_descent.GradientDescentOptimizer(
          learning_rate=1)
      cross_shard_optimizer = tpu_optimizer.CrossShardOptimizer(
          base_optimizer)

      train_op = cross_shard_optimizer.minimize(loss)
      gradients = (
          tpu_embedding_gradient.get_gradients_through_compute_gradients(
              cross_shard_optimizer, loss, activations))
      send_gradients_op = embedding.generate_send_gradients_op(gradients)
      with ops.control_dependencies([train_op, send_gradients_op]):
        loss = array_ops.identity(loss)

    loss = tpu.shard(computation,
                     num_shards=embedding.num_cores)

    with self.test_session() as sess:
      sess.run(tpu.initialize_system(embedding_config=
                                     embedding.config_proto))
      sess.run(variables.global_variables_initializer())
      sess.run(embedding_variables_and_ops.load_ops())
      sess.run(enqueue_ops)
      loss_val = sess.run(loss)
    ```

  Example with weight decay:

  >>> def learning_rate_fn(global_step):
  ...   return tf.compat.v1.train.polynomial_decay(
  ...     learning_rate=5e-5,
  ...     global_step=global_step,
  ...     decay_steps=100000,
  ...     end_learning_rate=0.0)
  >>> wordpiece_table_config = TableConfig(
  ...   vocabulary_size=119547,
  ...   dimension=256,
  ...   learning_rate_fn=learning_rate_fn)
  >>> wordpiece_feature_config = FeatureConfig(
  ...   table_id='bert/embeddings/word_embeddings',
  ...   max_sequence_length=512)
  >>> optimization_parameters = AdamParameters(
  ...   learning_rate=5e-5,
  ...   epsilon=1e-6,
  ...   weight_decay_factor=0.01,
  ...   multiply_weight_decay_factor_by_learning_rate=True)
  >>> tpu_embedding = TPUEmbedding(
  ...  table_to_config_dict={
  ...    'bert/embeddings/word_embeddings': wordpiece_table_config,
  ...  },
  ...  feature_to_config_dict={'input_ids': wordpiece_feature_config},
  ...  batch_size=128,
  ...  mode=TRAINING,
  ...  optimization_parameters=optimization_parameters,
  ...  master='')
  >>> with tf.Graph().as_default():
  ...   init_tpu_op = tf.compat.v1.tpu.initialize_system(
  ...     embedding_config=tpu_embedding.config_proto)
  ...   tf.compat.v1.Session().run(init_tpu_op)
  NFdivc              
      sr  |	dvrt d|	 d|	| _|
| _t| t|| _t|| t|| _t| j| _	t
| j| j	| _|| _|du rs|du rs du rFt d j j rVt d j j j| _ j| _| j| j | _ fddt| jD | _n^tj||d	}|jd
krt d||j| _|du rzt||}W n t y } zt t|d d}~ww g | _|jD ]}d|jv r|du s||jv r| j|j q|j| _|j| _t| j| j | j| j | _|tkrt || j || _!n |t"kr|durt d| dt#d| _!n	t dtt"||| _$| % | _&|| _'t(t)dd | j* D | _+dd t,| j+D | _-| . | _/dS )aA  API for using TPU for embedding lookups.

    Args:
      table_to_config_dict: A dictionary mapping from string of table name to
        `TableConfig`. Table refers to an embedding table, e.g. `params`
        argument to `tf.nn.embedding_lookup_sparse()`.
      feature_to_config_dict: A dictionary mapping from string of feature name
        to `FeatureConfig`. Feature refers to ids to lookup in embedding table,
        e.g. `sp_ids` argument to `tf.nn.embedding_lookup_sparse()`.
      batch_size: An `int` representing the global batch size.
      mode: `TRAINING` or `INFERENCE`.
      master: A `string` representing the TensorFlow master to use.
      optimization_parameters: `AdagradParameters`, `AdamParameters`,
        `Stochasticgradientdescentparameters`. Must be set in training unless
        all tables specify their own optimizers. And it must be `None` in
        inference.
      cluster_def: A ClusterDef object describing the TPU cluster.
      pipeline_execution_with_tensor_core: setting this to `True` makes training
        faster, but trained model will be different if step N and step N+1
        involve the same set of embedding IDs. Please see
        `tpu_embedding_configuration.proto` for details.
      partition_strategy: A string, either 'mod' or 'div', specifying how to map
        the lookup id to the embedding tensor. For more information see
        `tf.nn.embedding_lookup_sparse`.
      profile_data_directory: Directory where embedding lookup statistics are
        stored. These statistics summarize information about the inputs to the
        embedding lookup operation, in particular, the average number of
        embedding IDs per example and how well the embedding IDs are load
        balanced across the system. The lookup statistics are used during TPU
        initialization for embedding table partitioning. Collection of lookup
        statistics is done at runtime by  profiling the embedding inputs, only a
        small fraction of input samples are profiled to minimize host CPU
        overhead. Once a suitable number of samples are profiled, the lookup
        statistics are saved to table-specific files in the profile data
        directory generally at the end of a TPU training loop. The filename
        corresponding to each table is obtained by hashing table specific
        parameters (e.g., table name and number of features) and global
        configuration parameters (e.g., sharding strategy and task count). The
        same profile data directory can be shared among several models to reuse
        embedding lookup statistics.
      device_config: A DeviceConfig instance, used when `master` and
        `cluster_def` are both `None`.
      master_job_name: if set, overrides the master job name used to schedule
        embedding ops.

    Raises:
      ValueError: if any input is invalid.
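
    Example, a sketch of constructing the object without a TPU master by
    passing an explicit `DeviceConfig`; the host/core counts and job name are
    illustrative:

    ```
    embedding = TPUEmbedding(
        table_to_config_dict,
        feature_to_config_dict,
        batch_size=128,
        mode=TRAINING,
        optimization_parameters=AdagradParameters(0.1, 0.1),
        device_config=DeviceConfig(
            num_hosts=1, num_cores=8, job_name='tpu_worker'))
    ```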
    )r   modz5partition_strategy must be "div" or "mod". Received: r   NzOWhen master and cluster_def are both None,device_config must be set but is not.z9num_hosts ({}) should divide num_cores ({}) but does not.c                    s   g | ]	}d   j|qS )z!{}/replica:0/task:{}/device:CPU:0)r   r   )rR   idevice_configr/   r0   
<listcomp>r  s    z)TPUEmbedding.__init__.<locals>.<listcomp>)cluster_defr   z:TPUEmbedding needs TPUs, but master {} does not have TPUs.z" Please specify a master_job_name.zdevice:CPU:zI`optimization_parameters` should be `None` for inference mode. Received: r   z'`mode` only supports {} and {}; got {}.c                 s   s     | ]}|j d ur|j V  qd S rJ   )r+   )rR   cr/   r/   r0   rU     s
    z(TPUEmbedding.__init__.<locals>.<genexpr>c                 S      i | ]\}}||qS r/   r/   )rR   idfnr/   r/   r0   
<dictcomp>      
z)TPUEmbedding.__init__.<locals>.<dictcomp>)0r   _partition_strategy_profile_data_directory_validate_table_to_config_dict_create_ordered_dict_table_to_config_dict _validate_feature_to_config_dict_feature_to_config_dict_create_table_to_features_dict_table_to_features_dict_create_combiners
_combinersZ_batch_sizer   r   r   
_num_hosts
_num_cores_num_cores_per_hostrange_hoststpu_system_metadata_libZ_query_tpu_system_metadataZ
master_jobstrZdevicesnamerZ   Znum_of_cores_per_host_validate_batch_size_batch_size_per_coreTRAINING!_validate_optimization_parameters_optimization_parameters	INFERENCEr   _mode_get_optimizer_handler_by_table_optimizer_handler_dict$_pipeline_execution_with_tensor_corelistsetrD   _learning_rate_fn	enumerate_learning_rate_fn_to_tag_create_config_proto_config_proto)r   table_to_config_dictfeature_to_config_dict
batch_sizemodeZmasterr,   r   #pipeline_execution_with_tensor_coreZpartition_strategyprofile_data_directoryr   Zmaster_job_namer   edevicer/   r   r0   r     s   =









zTPUEmbedding.__init__c                 C      t  | jS )zdA list of device names for CPU hosts.

    Returns:
      A list of device names for CPU hosts.
    )copyr   r   r/   r/   r0   hosts  s   zTPUEmbedding.hostsc                 C      | j S )z^Number of TPU cores on a CPU host.

    Returns:
      Number of TPU cores on a CPU host.
    )r   r   r/   r/   r0   num_cores_per_host     zTPUEmbedding.num_cores_per_hostc                 C   r   )zhTotal number of TPU cores on all hosts.

    Returns:
      Total number of TPU cores on all hosts.
    )r   r   r/   r/   r0   r     r  zTPUEmbedding.num_coresc                 C   r   )zBatch size for each TPU core.

    The sparse tensors in `sparse_features_list` to `generate_enqueue_ops`
       must have batch dimension equal to this.

    Returns:
      Batch size for each TPU core.
    )r   r   r/   r/   r0   batch_size_per_core  s   
z TPUEmbedding.batch_size_per_corec                 C   r   )a  Create embedding config proto for `tpu.initialize_system()`.

    Returns:
      an `TPUEmbeddingConfiguration` proto describing the desired
         configuration of the hardware embedding lookup tables, which
         is passed to `tpu.initialize_system()`.
    )r   r   r/   r/   r0   config_proto  s   	zTPUEmbedding.config_protoc                 C   r   rJ   )r   r   r   r/   r/   r0   r        z!TPUEmbedding.table_to_config_dictc                 C   r   rJ   )r   r   r   r/   r/   r0   r     r  z#TPUEmbedding.feature_to_config_dictc                 C   r   rJ   )r   r   r   r/   r/   r0   table_to_features_dict  r  z#TPUEmbedding.table_to_features_dictc                 C   r   rJ   r   r   r/   r/   r0   r,        z$TPUEmbedding.optimization_parametersc                 C   s   t  }| jD ]}|j }||_| j| }t|jt| j	|_|j
|_
| j|  }|j}|jr7|j|j_n|jrD| j|j |jj_n|j|j_|jrPtjjntjj|_|jdur`|j|jj_|jdurk|j|jj_|j durv|j |j!j_|j"dur|j"|j!j_|j#r|j#|_#|j$rd|_$|j%rtj&j|j'_(| j| }|)| qdd t*| jD }| j+D ]8}| j+| }	|	D ].}
|j, }|| j-|
 j. |_.| j-|
 j/dkr|j01| j2| j-|
 j/g q|j01| j2g qq| j3|_4| j5|_6| j7|_8| j9dkrt jj:nt jj;|_<| j=|_>| j?r| j?|_@|S )z#Create `TPUEmbeddingConfiguration`.NTc                 S   r   r/   r/   )rR   r   tabler/   r/   r0   r   .  r   z5TPUEmbedding._create_config_proto.<locals>.<dictcomp>r   r   )AelcTPUEmbeddingConfigurationr   table_descriptoraddr   maxr%   lenr   r&   r   get_optimization_parametersr,   r*   Zconstantr+   r   Zdynamictagr}   r   ZGradientAccumulationStatusZENABLEDZDISABLEDZgradient_accumulation_statusr   Zgradient_clipping_limitslowervaluer   upperr~   Zclipping_limitsr   r   r   r)   ZHotIdReplicationConfigurationZ hot_id_replication_configurationstatusset_optimization_parametersr   r   feature_descriptorr   r8   r9   Zinput_shapeextendr   r   r   r   r   r   Znum_tensor_coresr   ZDIV_DEFAULTZMODZsharding_strategyr   r   r   r   )r   r  r  r  table_configr,   
parametersoptimizer_handlerZtable_to_idfeaturesfeaturer  r/   r/   r0   r     s   















z!TPUEmbedding._create_config_protoc              
      sR  i }i }g  g t | jD ]\}}|r|| }n|}|r!|| }n
| j| }	|	|}t r2d}
nt| j}
t	|
Q t
|| j| j| j| j| j| j| jtjjgd}|||< |r_dn| j }| j| ||| j| j| ||\}}}|||<  | | W d   n1 sw   Y  q fdd}fdd}t||||S )a  Create embedding and slot variables, with ops to load and retrieve them.

    N.B.: the retrieve embedding variables (including slot variables) ops are
    returned as lambda fn, as the call side might want to impose control
    dependencies between the TPU computation and retrieving actions. For
    example, the following code snippet ensures the TPU computation finishes
    first, and then we pull the variables back from TPU to CPU.

    ```
    updates_ops = []
    with ops.control_dependencies([loss]):
      for op_fn in retrieve_parameters_op_fns:
        update_ops.append(op_fn())
    ```

    Args:
      embedding_variable_name_by_table: A dictionary mapping from string of
        table name to string of embedding variable name. If `None`, defaults
        from `get_default_slot_variable_names()` will be used.
      slot_variable_names_by_table: A dictionary mapping from string of table
        name to `AdamSlotVariableNames`, `AdagradSlotVariableNames` etc. If
        `None`, defaults from `get_default_slot_variable_names()` will be used.

    Returns:
      `tpu_embedding.VariablesAndOps` with:
        A dictionary mapping from string of table name to embedding variables,
        A dictionary mapping from string of table name to AdagradSlotVariables,
         AdamSlotVariables etc with slot variables,
        A function which returns a list of ops to load embedding and slot
         variables from CPU to TPU.
        A function which returns a list of ops to retrieve embedding and slot
         variables from TPU to CPU.
     )r   r   r%   embedding_dimensionr'   rW   Nc                        g }  D ]}|  |  q| S )zCalls and returns the load ops for each embedding table.

      Returns:
        A list of ops to load embedding and slot variables from CPU to TPU.
      r  )Zload_ops_listZ
load_op_fn)load_op_fnsr/   r0   r{        z7TPUEmbedding.create_variables_and_ops.<locals>.load_opsc                     r  )zCalls and returns the retrieve ops for each embedding table.

      Returns:
        A list of ops to retrieve embedding and slot variables from TPU to CPU.
      r   )Zretrieve_ops_listZretrieve_op_fn)retrieve_op_fnsr/   r0   r|     r"  z;TPUEmbedding.create_variables_and_ops.<locals>.retrieve_ops)r   r   r   get_default_slot_variable_namesr   Zexecuting_eagerly_create_device_fnr   r   r   _create_partitioned_variablesr   r%   r&   r'   	GraphKeysGLOBAL_VARIABLESr  SerializeToStringcreate_variables_and_opsrZ   rx   )r   Z embedding_variable_name_by_tableZslot_variable_names_by_tablery   rz   r   r  Zembedding_variable_nameslot_variable_namesr  r   table_variablesconfigZslot_variables_for_tableload_ops_fnretrieve_ops_fnr{   r|   r/   )r!  r#  r0   r*  O  sT   $








z%TPUEmbedding.create_variables_and_opsc                    s$    |  fddt|D S )a  Generate enqueue ops.

    Args:
      enqueue_datas_list: a list of dictionary mapping from string of feature
        names to EnqueueData. Each dictionary is for one TPU core. Dictionaries
        for the same host should be contiguous in the list.
      mode_override: A string input that overrides the mode specified in the
        TPUEmbeddingConfiguration. Supported values are {'unspecified',
        'inference', 'training', 'backward_pass_only'}. When set to
        'unspecified', the mode set in TPUEmbeddingConfiguration is used,
        otherwise mode_override is used (optional).
      ragged: If True, creates RaggedTensor enqueue ops rather than
        SparseTensor.

    Returns:
      Ops to enqueue to TPU for embedding.
    c                    s(   g | ]\}}j ||j  d qS ))device_ordinalmode_overrideragged)_generate_enqueue_opr   )rR   r   r\   r1  r2  r   r/   r0   r     s    z5TPUEmbedding.generate_enqueue_ops.<locals>.<listcomp>)1_validate_generate_enqueue_ops_enqueue_datas_listr   )r   r[   r1  r2  r/   r4  r0   generate_enqueue_ops  s   
z!TPUEmbedding.generate_enqueue_opsc              
   C   s  dd }t | j }d}t|D ]\}}t | }|| }|r)td|||| }	|	r7td||	d}
d}| D ]\}}| j| j| j j	}t
|trv|jdu re|retd|| j| j| ||jd|| ||jd|| n1t
|tr|jdu r|rtd	|| j| j| ||jd
|| ||jd|| ntd|||
du r|jj}
|}q?|
|jjkrtd||
|jj||q?|| j r|
|krtd||
|q|
}qdS )zValidate `enqueue_datas_list`.c                 S   s.   | dur| j |jj krtd||dS dS )z*Helper function to check device agreement.NzKDevice of {0} does not agree with that ofembedding_indices for feature {1}.)r   r?   r   r   )datar   r  enqueue_datar/   r/   r0   _check_agreement  s   zXTPUEmbedding._validate_generate_enqueue_ops_enqueue_datas_list.<locals>._check_agreementNzR`enqueue_datas_list[{}]` misses a feature that is in `feature_to_config_dict`: {}.zS`enqueue_datas_list[{}]` has a feature that is not in `feature_to_config_dict`: {}.zINo sample indices set for features %f table %f but combiner is set to %s.r@   rA   zENo row splits set for features %f table %f but combiner is set to %s.rM   zp`enqueue_datas_list[{}]` has a feature that is not mapped to `EnqueueData` or `RaggedEnqueueData`. `feature`: {}zfDevices are different between features in `enqueue_datas_list[{}]`; devices: {}, {}; features: {}, {}.zWe expect the `enqueue_datas` which are on the same host to be contiguous in `enqueue_datas_list`, `enqueue_datas_list[{}]` is on device {}, but is expected to be on device {}.)r   r   keysr   r   r   rY   r   r8   r(   r   r<   r@   loggingwarnrA   rL   rM   r?   r   r   )r   r[   r9  Zfeature_setZcontiguous_devicer   r\   Zused_feature_setZmissing_feature_setZextra_feature_setr   Zdevice_featurer  r8  r(   r/   r/   r0   r5    s   




z>TPUEmbedding._validate_generate_enqueue_ops_enqueue_datas_listc                 C   sb   t | d }t|j tjd|| j|d| ||W  d   S 1 s*w   Y  dS )z&Creates op for enqueuing batch to TPU.r   )r0  	combinersr1  Nr/   )	r   rD   r   colocate_withr?   r   Z,enqueue_tpu_embedding_arbitrary_tensor_batchr   0_format_for_tpu_embedding_arbitrary_tensor_batch)r   r\   r0  r1  r2  Zenqueue_data0r/   r/   r0   r3  1  s   $z!TPUEmbedding._generate_enqueue_opc                 C   s:  g g g d}t jdtjd}t jdtjd}| jD ]}| j| }|D ]w}|| }	|r:|d |	jdur6|	jn| nH| j| j	dkre|	j
dure|	j
jd dkret j|	j
ddgddggd	}
|d |
 n|	j
du sr|	j
jd dkrz|d | n|d |	j
 |d
 |	jdur|	jn| |d |	j q"q|S )aY  Format features for `enqueue_tpu_embedding_arbitrary_tensor_batch()`.

    Args:
      enqueue_datas: a `Dict` mapping feature names to `EnqueueData` or
        `RaggedEnqueueData` objects for embedding.
      ragged: If True, extract row splits from the data rather than sample
        indices.

    Returns:
      Dict of arguments for `enqueue_tpu_embedding_arbitrary_tensor_batch()`.
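
    For example, the returned dictionary has roughly this shape (the key names
    below are an assumption for illustration; each list holds one tensor per
    feature, in table order):

    ```
    {
        'sample_indices_or_row_splits': [...],  # one tensor per feature
        'embedding_indices': [...],
        'aggregation_weights': [...],
    }
    ```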
    )sample_indices_or_row_splitsr?   rA   )r   dtyper@  Nr   r   r   )ZpaddingsrA   r?   )r   Zzerosr   Zint64float32r   rZ   rM   r   r9   r@   shapepadrA   r?   )r   r\   r2  kwargsZ	int_zerosZfloat_zerosr  r  r  r8  r@   r/   r/   r0   r?  @  sT   




z=TPUEmbedding._format_for_tpu_embedding_arbitrary_tensor_batchc                 C   sZ   t jt| j| j d}t }d}| jD ]}| j| D ]}|| ||< |d7 }qq|S )zGet activations for features.

    This should be called within the `computation` that is passed to
    `tpu.replicate` and friends.

    Returns:
      A dictionary mapping from string of feature name to `Tensor` of
        activation.
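
    For example, inside the replicated computation (the `embedding` object and
    the feature name 'watched' are placeholders used for illustration):

    ```
    def computation():
      activations = embedding.get_activations()
      watched_embedding = activations['watched']
      # ... build the rest of the model on top of the activations ...
    ```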
    )Znum_outputsr-  r   r   )	r   Zrecv_tpu_embedding_activationsr  r   r   r)  rW   rX   r   )r   Zrecv_activationsZactivationsindexr  r  r/   r/   r0   get_activationsu  s   


zTPUEmbedding.get_activationsc                    s   | j tkrtd| j  du r| jrtdg }| jD ]}| j| D ]	}|||  q$qtj	| fdd| jD | j
 dS )aD  Send gradient to TPU embedding.

    Args:
      feature_to_gradient_dict: dict mapping feature names to gradient wrt
        activations.
      step: the current global step, used for dynamic learning rate.

    Returns:
      SendTPUEmbeddingGradients Op.

    Raises:
      RuntimeError: If `mode` is not `TRAINING`.
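
    For example, a minimal training sketch (the `embedding`, `model_fn` and
    `training_step` names are placeholders; computing gradients with
    `tf.gradients` is one possible way to obtain gradients wrt activations):

    ```
    activations = embedding.get_activations()
    loss = model_fn(activations)
    grads = tf.gradients(loss, list(activations.values()))
    feature_to_gradient_dict = dict(zip(activations.keys(), grads))
    send_op = embedding.generate_send_gradients_op(
        feature_to_gradient_dict, step=training_step)
    ```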
    zNOnly in training mode gradients need to be sent to TPU embedding; got mode {}.Nz2There are dynamic learning rates but step is None.c                    s    g | ]}t j| tjd qS )rA  )r   castr   rC  )rR   r   stepr/   r0   r     s    z;TPUEmbedding.generate_send_gradients_op.<locals>.<listcomp>)ZinputsZlearning_ratesr-  )r   r   RuntimeErrorr   r   r   r   rZ   r   Zsend_tpu_embedding_gradientsr  r)  )r   Zfeature_to_gradient_dictrK  Z	gradientsr  r  r/   rJ  r0   generate_send_gradients_op  s(   


z'TPUEmbedding.generate_send_gradients_opc                 C   s@   i }| j  D ]\}}|jd ur|j}n| j}t|||< q|S rJ   )r   rY   r,   r   _get_optimization_handler)r   Zoptimizer_handlersr  r  Z	optimizerr/   r/   r0   r     s   
z,TPUEmbedding._get_optimizer_handler_by_table)NNNFr   NNNrI   )NFrJ   )r2   r3   r4   r5   r   propertyr   r   r   r  r  r   r   r  r,   r   r*  r6  r5  r3  r?  rH  rM  r   r/   r/   r/   r0   r     sZ    x
 










Z
n
!Y
5
"r   c                 C   s4   |   D ]\}}t|tstdt||qdS )z Validate `table_to_config_dict`.zMValue of `table_to_config_dict` must be of type `TableConfig`, got {} for {}.N)rY   r   r   r   r   r!   )r   rS   rT   r/   r/   r0   r     s   
r   c                 C   sZ   t dd | D }t |  }|| }|rtd||| }|r+td|dS )z"Validate `feature_to_config_dict`.c                 S   s   g | ]}|j qS r/   )r8   )rR   r  r/   r/   r0   r     s    z4_validate_feature_to_config_dict.<locals>.<listcomp>zX`table_to_config_dict` specifies table that is not used in `feature_to_config_dict`: {}.z_`feature_to_config_dict` refers to a table that is not specified in `table_to_config_dict`: {}.N)r   rD   r:  r   r   )r   r   Zused_table_setZ	table_setZunused_table_setZextra_table_setr/   r/   r0   r     s"   r   c                 C   s   | | rt d| |d S )NzT`batch_size` is not a multiple of number of cores. `batch_size`={}, `_num_cores`={}.)r   r   )r   r   r/   r/   r0   r     s   r   c                 C   s\   d}|  D ]\}}|jdu rd} qq| r&t| ts$tdt| dS |r,tddS )a_  Validate global optimization_parameters and per table optimizers.

  If the global optimizer is `None`, every table optimizer must be non-`None`.

  Args:
      optimization_parameters: global optimizer provided in `TPUEmbedding`
        constructor.
      table_to_config_dict: A dictionary mapping from string of table name to
        `TableConfig`.
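
  For example, this combination passes validation because every table carries
  its own optimizer even though the global one is `None` (the table names and
  parameter values below are illustrative only):

  ```
  table_to_config_dict = {
      'video': TableConfig(vocabulary_size=1000, dimension=16,
                           optimization_parameters=AdagradParameters(0.1, 0.1)),
      'user': TableConfig(vocabulary_size=500, dimension=8,
                          optimization_parameters=AdagradParameters(0.1, 0.1)),
  }
  optimization_parameters = None  # valid: each table overrides it
  ```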
  FNTzi`optimization_parameters` must inherit from `_OptimizationParameters`. `type(optimization_parameters)`={}z%`optimization_parameters` is missing.)rY   r,   r   r    r   r   r!   )r,   r   Ztbl_optimizer_missing_r  r/   r/   r0   r     s"   

r   c                   @   s8   e Zd ZdZdd Zdd Zdd Zdd	 Zd
d ZdS )_OptimizerHandlerz6Interface class for handling optimizer specific logic.c                 C   s
   || _ d S rJ   r  )r   r,   r/   r/   r0   r     s   
z_OptimizerHandler.__init__c                 C   r   rJ   r  r   r/   r/   r0   r        z-_OptimizerHandler.get_optimization_parametersc                 C      t  rJ   NotImplementedErrorr   r  r/   r/   r0   r    rR  z-_OptimizerHandler.set_optimization_parametersc                 C   rS  rJ   rT  r   r  r/   r/   r0   r$    rR  z1_OptimizerHandler.get_default_slot_variable_namesc                 C   rS  rJ   rT  )r   r  r+  r   r  r,  r  r/   r/   r0   r*  	  r  z*_OptimizerHandler.create_variables_and_opsN)	r2   r3   r4   r5   r   r  r  r$  r*  r/   r/   r/   r0   rQ    s    rQ  c                   @   (   e Zd ZdZdd Zdd Zdd ZdS )	_AdagradHandlerzHandles Adagrad specific logic.c                 C      |j j  d S rJ   )r,   ZadagradSetInParentrV  r/   r/   r0   r       z+_AdagradHandler.set_optimization_parametersc                 C      t d|dS )N{}/{}ZAdagrad)ra   r   rW  r/   r/   r0   r$    r\  z/_AdagradHandler.get_default_slot_variable_namesc                    h   t | jj}t|j|j|jtj	j
g|d t } fdd}	 fdd}
||	|
fS )Nr   r   r%   r  rW   r'   c               
      v   } g }t t D ],\}}}t| tj|||| d}W d   n1 s,w   Y  d} || q|S )Returns the retrieve ops for AdaGrad embedding tables.

      Returns:
        A list of ops to load embedding and slot variables from CPU to TPU.
      r  accumulators
table_name
num_shardsshard_idr-  N)zipr   r   r>  r   Z%load_tpu_embedding_adagrad_parametersrZ   r-  load_op_listhost_idtable_variableaccumulator_variableload_parameters_opaccumulator_variablesr  r   r  r,  r/   r0   r.  $  &   
	z=_AdagradHandler.create_variables_and_ops.<locals>.load_ops_fnc               	         } g }t t D ]:\}}}t|! tj|| d\}}tt	||t	||}W d   n1 s:w   Y  d} |
| q|S )zReturns the retrieve ops for AdaGrad embedding tables.

      Returns:
        A list of ops to retrieve embedding and slot variables from TPU to CPU.
      re  rf  rg  r-  N)rh  r   r   r>  r   Z)retrieve_tpu_embedding_adagrad_parametersr	   groupr   assignrZ   r-  retrieve_op_listrk  rl  rm  retrieved_tableretrieved_accumulatorretrieve_parameters_opro  r/   r0   r/  ;  *   



zA_AdagradHandler.create_variables_and_ops.<locals>.retrieve_ops_fn)r
   constant_initializerr   r   r&  rb   r%   r&   r   r'  r(  rs   r   r  r+  r   r  r,  r  accumulator_initializerslot_variablesr.  r/  r/   ro  r0   r*       
z(_AdagradHandler.create_variables_and_opsNr2   r3   r4   r5   r  r$  r*  r/   r/   r/   r0   rY    s
    rY  c                   @   rX  )	_AdagradMomentumHandlera  Handles Adagrad with Momentum specific logic.

  Creates slot variables and defines their initializers. Defines load/retrieve
  operations to be used for loading variables into TPU memory (from host memory)
  and retrieving variables from TPU memory (into host memory).
  c                 C   sV   |j j  | jj|j j_| jj|j j_| jj|j j_| jj|j j_| jj|j j_d S rJ   )	r,   Zadagrad_momentumr[  r   r   r   r   r   r   rV  r/   r/   r0   r  ^     z3_AdagradMomentumHandler.set_optimization_parametersc                 C      t d|dd|dS )Nz{}/{}/AccumulatorZAdagradMomentumz{}/{}/Momentum)re   r   rW  r/   r/   r0   r$  k     

z7_AdagradMomentumHandler.get_default_slot_variable_namesc                    s   t  }t|j|j|jtjjg|d t  }t|j	|j|jtjjg|dt
 }	 fdd}
 fdd}|	|
|fS )Nr`  c                     |   } g }t t D ].\}}}}t| tj||||| d}W d   n1 s/w   Y  d} || q|S )zReturns the load ops for AdaGrad with momentum embedding tables.

      Returns:
        A list of ops to load embedding and slot variables from CPU to TPU.
      )r  rd  rd   re  rf  rg  r-  N)rh  r   r   r>  r   Z.load_tpu_embedding_adagrad_momentum_parametersrZ   )r-  rj  rk  rl  rm  momenta_variablern  rp  r  momenta_variablesr   r  r,  r/   r0   r.    *   

zE_AdagradMomentumHandler.create_variables_and_ops.<locals>.load_ops_fnc            
   
         } g }t t D ]A\}}}}t|' tj|| d\}}}tt	||t	||t	||}	W d   n1 sBw   Y  d} |
|	 q|S )zReturns the retrieve ops for AdaGrad with momentum embedding tables.

      Returns:
        A list of ops to retrieve embedding and slot variables from TPU to CPU.
      rs  N)rh  r   r   r>  r   Z2retrieve_tpu_embedding_adagrad_momentum_parametersr	   rt  r   ru  rZ   )
r-  rw  rk  rl  rm  r  rx  ry  retrieved_momentarz  r  r/   r0   r/    s0   



zI_AdagradMomentumHandler.create_variables_and_ops.<locals>.retrieve_ops_fn)r
   zeros_initializerr&  rb   r%   r&   r   r'  r(  rd   rq   )r   r  r+  r   r  r,  r  r~  momenta_initializerr  r.  r/  r/   r  r0   r*  p  s0   
z0_AdagradMomentumHandler.create_variables_and_opsNr  r/   r/   r/   r0   r  V  s
    r  c                   @   rX  )	_ProximalAdagradHandlerz'Handles ProximalAdagrad specific logic.c                 C   s,   |j j  | jj|j j_| jj|j j_d S rJ   )r,   Zproximal_adagradr[  r   r   l1r   l2rV  r/   r/   r0   r    
   z3_ProximalAdagradHandler.set_optimization_parametersc                 C   r]  )Nr^  ZProximalAdagrad)ri   r   rW  r/   r/   r0   r$    s   z7_ProximalAdagradHandler.get_default_slot_variable_namesc                    r_  )Nr`  c               
      ra  )zReturns the retrieve ops for Proximal AdaGrad embedding tables.

      Returns:
        A list of ops to load embedding and slot variables from CPU to TPU.
      rc  N)rh  r   r   r>  r   Z.load_tpu_embedding_proximal_adagrad_parametersrZ   ri  ro  r/   r0   r.    rq  zE_ProximalAdagradHandler.create_variables_and_ops.<locals>.load_ops_fnc               	      rr  )zReturns the retrieve ops for Proximal AdaGrad embedding tables.

      Returns:
        A list of ops to retrieve embedding and slot variables from TPU to CPU.
      rs  N)rh  r   r   r>  r   Z2retrieve_tpu_embedding_proximal_adagrad_parametersr	   rt  r   ru  rZ   rv  ro  r/   r0   r/    r{  zI_ProximalAdagradHandler.create_variables_and_ops.<locals>.retrieve_ops_fn)r
   r|  r   r   r&  rb   r%   r&   r   r'  r(  rt   r}  r/   ro  r0   r*    r  z0_ProximalAdagradHandler.create_variables_and_opsNr  r/   r/   r/   r0   r    s
    r  c                   @   rX  )	_AdamHandlerzHandles Adam specific logic.c                 C   sL   | j j|jj_| j j|jj_| j j|jj_| j j |jj_| j j|jj_	d S rJ   )
r   r   r,   Zadamr   r   r   Zuse_non_lazy_adamr   Zuse_sum_inside_sqrtrV  r/   r/   r0   r  	  s   z(_AdamHandler.set_optimization_parametersc                 C   r  )Nz{}/{}/mZAdamz{}/{}/v)r_   r   rW  r/   r/   r0   r$  	     
z,_AdamHandler.get_default_slot_variable_namesc                    s   t  }t|j|j|jtjjg|dt  }t|j	|j|jtjjg|dt
}	 fdd}
 fdd}|	|
|fS )Nr`  c                     s|   g }  }t tD ].\}}}}t| tj|||||d}W d   n1 s/w   Y  d}| | q| S )rb  )r  rd   Z
velocitiesre  rf  rg  r-  N)rh  r   r   r>  r   Z"load_tpu_embedding_adam_parametersrZ   )rj  r-  rk  rl  
m_variable
v_variablern  r  Zm_variablesr   r  r,  Zv_variablesr/   r0   r.  0	  (   z:_AdamHandler.create_variables_and_ops.<locals>.load_ops_fnc            
   
      s   g }  }t tD ]A\}}}}t|' tj||d\}}}tt	||t	||t	||}	W d   n1 sBw   Y  d}| 
|	 q| S )zReturns the retrieve ops for Adam embedding tables.

      Returns:
        A list of ops to retrieve embedding and slot variables from TPU to CPU.
      rs  N)rh  r   r   r>  r   Z&retrieve_tpu_embedding_adam_parametersr	   rt  r   ru  rZ   )
rw  r-  rk  rl  r  r  rx  retrieved_mretrieved_vrz  r  r/   r0   r/  J	  ,   


z>_AdamHandler.create_variables_and_ops.<locals>.retrieve_ops_fn)r
   r  r&  r`   r%   r&   r   r'  r(  rT   ro   )r   r  r+  r   r  r,  r  m_initializerv_initializerr  r.  r/  r/   r  r0   r*  	  s,   

z%_AdamHandler.create_variables_and_opsNr  r/   r/   r/   r0   r  		  s
    r  c                   @   rX  )	_FtrlHandlerzHandles Ftrl specific logic.c                 C   sX   | j j|jj_| j j|jj_| j j|jj_| j j	|jj_
| j j|jj_| j j|jj_d S rJ   )r   r   r,   ZftrlZlr_powerr   r  r   r  r   Zmultiply_linear_by_lrr   r   rV  r/   r/   r0   r  i	  s   z(_FtrlHandler.set_optimization_parametersc                 C   s   t d|dd|dS )Nr^  ZFtrlZFtrl_1)rj   r   rW  r/   r/   r0   r$  w	  s   

z,_FtrlHandler.get_default_slot_variable_namesc                    s   t | jj}t|j|j|jtj	j
g|d t | jj}t|j|j|jtj	j
g|dt }	 fdd}
 fdd}|	|
|fS )Nr`  c                     r  )zReturns the retrieve ops for Ftrl embedding tables.

      Returns:
        A list of ops to load embedding and slot variables from CPU to TPU.
      )r  rd  Zlinearsre  rf  rg  r-  N)rh  r   r   r>  r   Z"load_tpu_embedding_ftrl_parametersrZ   )r-  rj  rk  rl  rm  linear_variablern  rp  r  Zlinear_variablesr   r  r,  r/   r0   r.  	  r  z:_FtrlHandler.create_variables_and_ops.<locals>.load_ops_fnc            
   
      r  )zReturns the retrieve ops for Ftrl embedding tables.

      Returns:
        A list of ops to retrieve embedding and slot variables from TPU to CPU.
      rs  N)rh  r   r   r>  r   Z&retrieve_tpu_embedding_ftrl_parametersr	   rt  r   ru  rZ   )
r-  rw  rk  rl  rm  r  rx  ry  Zretrieved_linearrz  r  r/   r0   r/  	  s.   



z>_FtrlHandler.create_variables_and_ops.<locals>.retrieve_ops_fn)r
   r|  r   r   r&  rb   r%   r&   r   r'  r(  r   rk   ru   )r   r  r+  r   r  r,  r  r~  Zlinear_initializerr  r.  r/  r/   r  r0   r*  ~	  s4   

z%_FtrlHandler.create_variables_and_opsNr  r/   r/   r/   r0   r  f	  s
    r  c                   @   rX  )	_ProximalYogiHandlerz%Handles Proximal Yogi specific logic.c                 C   sV   |j j  | jj|j j_| jj|j j_| jj|j j_| jj|j j_| jj	|j j_
d S rJ   )r,   Zproximal_yogir[  r   r   r   r   r   r  r   r  rV  r/   r/   r0   r  	  r  z0_ProximalYogiHandler.set_optimization_parametersc                 C   r  )Nr^  ZProximalYogiz{}/{}_1)rl   r   rW  r/   r/   r0   r$  	  r  z4_ProximalYogiHandler.get_default_slot_variable_namesc                    s   t | jj}t|j|j|jtj	j
g|dt  }t|j|j|jtj	j
g|dt}	 fdd}
 fdd}|	|
|fS )Nr`  c                     s|   g }  }t tD ].\}}}}t| tj|||||d}W d   n1 s/w   Y  d}| | q| S )zReturns the load ops for Proximal Yogi embedding tables.

      Returns:
        A list of ops to load embedding and slot variables from CPU to TPU.
      )r  rT   r`   re  rf  rg  r-  N)rh  r   r   r>  r   Z+load_tpu_embedding_proximal_yogi_parametersrZ   )rj  r-  rk  rl  r  r  rn  r  r/   r0   r.  	  r  zB_ProximalYogiHandler.create_variables_and_ops.<locals>.load_ops_fnc            
   
      s   g }  }t tD ]A\}}}}t|' tj||d\}}}tt	||t	||t	||}	W d   n1 sBw   Y  d}| 
|	 q| S )zReturns the retrieve ops for Proximal Yogi embedding tables.

      Returns:
        A list of ops to retrieve embedding and slot variables from TPU to CPU.
      rs  N)rh  r   r   r>  r   Z/retrieve_tpu_embedding_proximal_yogi_parametersr	   rt  r   ru  rZ   )
rw  r-  rk  rl  r  r  rx  r  r  rz  r  r/   r0   r/  
  r  zF_ProximalYogiHandler.create_variables_and_ops.<locals>.retrieve_ops_fn)r
   r|  r   r   r&  rT   r%   r&   r   r'  r(  r  r`   rv   )r   r  r+  r   r  r,  r  r  r  r  r.  r/  r/   r  r0   r*  	  s0   

z-_ProximalYogiHandler.create_variables_and_opsNr  r/   r/   r/   r0   r  	  s
    r  c                   @   rX  )	_MomentumHandlerz Handles Momentum specific logic.c                 C   s,   |j j  | jj|j j_| jj|j j_d S rJ   )r,   r   r[  r   r   rV  r/   r/   r0   r  -
  r  z,_MomentumHandler.set_optimization_parametersc                 C   r]  )Nr^  ZMomentum)rc   r   rW  r/   r/   r0   r$  4
  r\  z0_MomentumHandler.get_default_slot_variable_namesc                    sb   t  }t|j|j|jtjjg|dt	} fdd}	 fdd}
||	|
fS )Nr`  c               
      v   g }  }t tD ],\}}}t| tj||||d}W d   n1 s,w   Y  d}| | q| S )zReturns the retrieve ops for Momentum embedding tables.

      Returns:
        A list of ops to load embedding and slot variables from CPU to TPU.
      )r  rd   re  rf  rg  r-  N)rh  r   r   r>  r   Z&load_tpu_embedding_momentum_parametersrZ   )rj  r-  rk  rl  r  rn  r  r  r   r  r,  r/   r0   r.  D
  s$   
	z>_MomentumHandler.create_variables_and_ops.<locals>.load_ops_fnc               	         g }  }t tD ]:\}}}t|! tj||d\}}tt	||t	||}W d   n1 s:w   Y  d}| 
| q| S )zReturns the retrieve ops for Momentum embedding tables.

      Returns:
        A list of ops to retrieve embedding and slot variables from TPU to CPU.
      rs  N)rh  r   r   r>  r   Z*retrieve_tpu_embedding_momentum_parametersr	   rt  r   ru  rZ   )rw  r-  rk  rl  r  rx  r  rz  r  r/   r0   r/  [
  *   


zB_MomentumHandler.create_variables_and_ops.<locals>.retrieve_ops_fn)
r
   r  r&  rd   r%   r&   r   r'  r(  rp   )r   r  r+  r   r  r,  r  r  r  r.  r/  r/   r  r0   r*  7
  s   
z)_MomentumHandler.create_variables_and_opsNr  r/   r/   r/   r0   r  *
  s
    r  c                   @   rX  )	_RMSPropHandlerz Handles RMS prop specific logic.c                 C   s:   |j j  | jj|j j_| jj|j j_| jj|j j_d S rJ   )r,   Zrms_propr[  r   r   r   r   rV  r/   r/   r0   r  z
  s   z+_RMSPropHandler.set_optimization_parametersc                 C   r  )Nz{}/{}/msZRMSPropz	{}/{}/mom)rf   r   rW  r/   r/   r0   r$  
  r  z/_RMSPropHandler.get_default_slot_variable_namesc           
         s   t |j|j|jtjjgt dt |j	|j|jtjjgt dt
} fdd} fdd}	|||	fS )Nr`  c                     s|   g }  }t tD ].\}}}}t| tj|||||d}W d   n1 s/w   Y  d}| | q| S )zReturns the retrieve ops for RMS Prop embedding tables.

      Returns:
        A list of ops to load embedding and slot variables from CPU to TPU.
      )r  rg   rh   re  rf  rg  r-  N)rh  r   r   r>  r   Z&load_tpu_embedding_rms_prop_parametersrZ   )rj  r-  rk  rl  ms_variablemom_variablern  r  Zmom_variablesZms_variablesr   r  r,  r/   r0   r.  
  s&   
z=_RMSPropHandler.create_variables_and_ops.<locals>.load_ops_fnc            
   
      s   g }  }t tD ]A\}}}}t|' tj||d\}}}tt	||t	||t	||}	W d   n1 sBw   Y  d}| 
|	 q| S )zReturns the retrieve ops for RMS Prop embedding tables.

      Returns:
        A list of ops to retrieve embedding and slot variables from TPU to CPU.
      rs  N)rh  r   r   r>  r   Z*retrieve_tpu_embedding_rms_prop_parametersr	   rt  r   ru  rZ   )
rw  r-  rk  rl  r  r  rx  Zretrieved_msZretrieved_momrz  r  r/   r0   r/  
  s,   


zA_RMSPropHandler.create_variables_and_ops.<locals>.retrieve_ops_fn)r&  rg   r%   r&   r   r'  r(  r
   r  rh   rr   
r   r  r+  r   r  r,  r  r  r.  r/  r/   r  r0   r*  
  s(   

z(_RMSPropHandler.create_variables_and_opsNr  r/   r/   r/   r0   r  w
  s
    	r  c                   @   rX  )	_FrequencyEstimatorHandlerz+Handles frequency estimator specific logic.c                 C   s@   |j j  |j j}| jj|_| jj|_| jj|_| jj|_d S rJ   )r,   Zfrequency_estimatorr[  r   r   r   r   r   )r   r  freqr/   r/   r0   r  
  s   


z6_FrequencyEstimatorHandler.set_optimization_parametersc                 C   s   t d|S )Nz{}/FrequencyEstimator)rm   r   rW  r/   r/   r0   r$  
  s   z:_FrequencyEstimatorHandler.get_default_slot_variable_namesc           
         sx   |j dkrtd|j t|j|j|j tjjgt	
 dt} fdd} fdd}	|||	fS )Nr   zRFrequencyEstimator tables should only have a dimension of 1. Received dimension {}r`  c               
      r  )zReturns the retrieve ops for Frequency Estimator embedding tables.

      Returns:
        A list of ops to load embedding and slot variables from CPU to TPU.
      )r  rn   re  rf  rg  r-  N)rh  r   r   r>  r   Z1load_tpu_embedding_frequency_estimator_parametersrZ   )rj  r-  rk  rl  last_hit_step_variablern  r  Zlast_hit_step_variablesr   r  r,  r/   r0   r.  
  rq  zH_FrequencyEstimatorHandler.create_variables_and_ops.<locals>.load_ops_fnc               	      r  )zReturns the retrieve ops for Frequency Estimator embedding tables.

      Returns:
        A list of ops to retrieve embedding and slot variables from TPU to CPU.
      rs  N)rh  r   r   r>  r   Z5retrieve_tpu_embedding_frequency_estimator_parametersr	   rt  r   ru  rZ   )rw  r-  rk  rl  r  rx  Zretrieved_last_hit_steprz  r  r/   r0   r/    r  zL_FrequencyEstimatorHandler.create_variables_and_ops.<locals>.retrieve_ops_fn)r&   r   r   r&  rn   r%   r   r'  r(  r
   r  rw   r  r/   r  r0   r*  
  s$   

z3_FrequencyEstimatorHandler.create_variables_and_opsNr  r/   r/   r/   r0   r  
  s
    r  c                   @   rX  )	!_StochasticGradientDescentHandlerz3Handles stochastic gradient descent specific logic.c                 C   rZ  rJ   )r,   Zstochastic_gradient_descentr[  rV  r/   r/   r0   r  '  s   
z=_StochasticGradientDescentHandler.set_optimization_parametersc                 C   s   d S rJ   r/   rW  r/   r/   r0   r$  +  s   zA_StochasticGradientDescentHandler.get_default_slot_variable_namesc           	         s0   ~ fdd} fdd}d ||fS )Nc               	      sj   g }  }t D ]*\}}t| tj|||d}W d   n1 s&w   Y  d}| | q| S )rb  )r  re  rf  rg  r-  N)r   r   r>  r   Z9load_tpu_embedding_stochastic_gradient_descent_parametersrZ   )rj  r-  rk  rl  rn  r  r   r  r,  r/   r0   r.  2  s    zO_StochasticGradientDescentHandler.create_variables_and_ops.<locals>.load_ops_fnc               	      sz   g }  }t D ]2\}}t| tj||d}tt||}W d   n1 s.w   Y  d}| 	| q| S )zReturns the retrieve ops for SGD embedding tables.

      Returns:
        A list of ops to retrieve embedding and slot variables from TPU to CPU.
      rs  N)
r   r   r>  r   Z=retrieve_tpu_embedding_stochastic_gradient_descent_parametersr	   rt  r   ru  rZ   )rw  r-  rk  rl  rx  rz  r  r/   r0   r/  G  s&   

zS_StochasticGradientDescentHandler.create_variables_and_ops.<locals>.retrieve_ops_fnr/   )	r   r  r+  r   r  r,  r  r.  r/  r/   r  r0   r*  .  s   
z:_StochasticGradientDescentHandler.create_variables_and_opsNr  r/   r/   r/   r0   r  $  s
    r  c                 C   s   t | tr	t| S t | trt| S t | trt| S t | tr$t| S t | t	r-t
| S t | tr6t| S t | tr?t| S t | trHt| S t | trQt| S t | trZt| S t S )z7Gets the optimization handler given the parameter type.)r   r   rY  r   r  r   r  r   r  r   r  r   r  r   r  r   r  r   r  r   r  rU  )r,   r/   r/   r0   rN  a  s*   









rN  c                    s   t  fddt D S )z Create an OrderedDict from Dict.c                 3   s    | ]	}| | fV  qd S rJ   r/   )rR   rS   dr/   r0   rU   |  s    z'_create_ordered_dict.<locals>.<genexpr>)rW   rX   sortedr  r/   r  r0   r   z  s   r   c                 C   s8   g }| D ]}| | j pd}||gt||   q|S )z9Create a per feature list of combiners, ordered by table.r   )r(   r  r  )r   r  r=  r  r(   r/   r/   r0   r     s
   r   c                 C   sh   i }|   D ]\}}|j|v r||j | q|g||j< qt }t|D ]
}t|| ||< q'|S )z4Create mapping from table to a list of its features.)rY   r8   rZ   rW   rX   r  )r   Ztable_to_features_dict_tmpr  Zfeature_configr  r  r/   r/   r0   r     s   
r   c                    s    fdd}|S )z?Create device_fn() to use with _create_partitioned_variables().c                    sp   t d| j}t d| j}|s|std| j|r$t|d}nt|d} | }td| | |S )zReturns the `device` for `op`.z.*/part_(\d+)(/|$)z.*dummy_(\d+).*z9Internal Error: Expected {} to contain /part_* or dummy_*r   zassigning {} to {}.)	rematchr   rL  r   r   rt  r;  debug)opZ
part_matchZdummy_matchidxr   r   r/   r0   	device_fn  s   z$_create_device_fn.<locals>.device_fnr/   )r   r  r/   r  r0   r%    s   r%  c           	      C   s   t ||}ttj| ||ft|tj||dd}||kr|S t|| D ]}|	tjd
|| | d|ftj|tjjgdd q%|S )z>Creates PartitionedVariables based on `num_hosts` for `table`.F)rD  ZpartitionerrB  r'   rW   	trainablezdummy_{}_{}r   )rD  rB  r'   rW   r  )minr   r   Zget_variabler   Zfixed_size_partitionerr   rC  r   rZ   r   r   r'  ZLOCAL_VARIABLES)	r   r   r%   r  r'   rW   Z
num_slicesZvar_listr  r/   r/   r0   r&    s4   

	r&  rJ   )ar5   rW   r   r   r  typingr   Ztensorflow.core.protobuf.tpur   r   r	  Ztensorflow.python.eagerr   Ztensorflow.python.frameworkr   r   Ztensorflow.python.opsr   r	   r
   r   r   r   r   Ztensorflow.python.platformr   r;  Ztensorflow.python.tpur   r   Ztensorflow.python.tpu.opsr   Z tensorflow.python.util.tf_exportr   r
  r   r   
namedtupler   r7   r<   rL   r]   r^   r_   ra   rc   re   rf   ri   rj   rl   rm   ro   rp   rq   rr   rs   rt   ru   rv   rw   rx   r    r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rQ  rY  r  r  r  r  r  r  r  r  r  rN  r   r   r   r%  r&  r/   r/   r/   r0   <module>   s$  

e

*)
DQ
I
Zq^K
F;:      HfM]d`MZS=	