"""Attention layers that can be used in sequence DNN/CNN models.

This file follows the terminology of https://arxiv.org/abs/1706.03762 Figure 2.
Attention is formed by three tensors: Query, Key and Value.
"""

from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.framework import tensor_shape
from tensorflow.python.keras import backend
from tensorflow.python.keras.engine.base_layer import Layer
from tensorflow.python.keras.utils import control_flow_util
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import init_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import nn
from tensorflow.python.util.tf_export import keras_export


class BaseDenseAttention(Layer):
  """Base Attention class for Dense networks.

  This class is suitable for Dense or CNN networks, and not for RNN networks.

  Implementations of attention mechanisms should inherit from this class, and
  reuse the `_apply_scores()` method.

  Args:
    causal: Boolean. Set to `True` for decoder self-attention. Adds a mask such
      that position `i` cannot attend to positions `j > i`. This prevents the
      flow of information from the future towards the past.
    dropout: Float between 0 and 1. Fraction of the units to drop for the
      attention scores.

  Call Args:

    inputs: List of the following tensors:
      * query: Query `Tensor` of shape `[batch_size, Tq, dim]`.
      * value: Value `Tensor` of shape `[batch_size, Tv, dim]`.
      * key: Optional key `Tensor` of shape `[batch_size, Tv, dim]`. If not
        given, will use `value` for both `key` and `value`, which is the
        most common case.
    mask: List of the following tensors:
      * query_mask: A boolean mask `Tensor` of shape `[batch_size, Tq]`.
        If given, the output will be zero at the positions where
        `mask==False`.
      * value_mask: A boolean mask `Tensor` of shape `[batch_size, Tv]`.
        If given, will apply the mask such that values at positions where
        `mask==False` do not contribute to the result.
    training: Python boolean indicating whether the layer should behave in
      training mode (adding dropout) or in inference mode (no dropout).
    return_attention_scores: bool, if `True`, returns the attention scores
      (after masking and softmax) as an additional output argument.

  Output:

    Attention outputs of shape `[batch_size, Tq, dim]`.
    [Optional] Attention scores after masking and softmax with shape
      `[batch_size, Tq, Tv]`.
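
  A minimal usage sketch of this call contract, via the concrete
  `tf.keras.layers.Attention` subclass (the tensors and mask values below are
  purely illustrative):

  ```python
  import tensorflow as tf

  query = tf.random.normal(shape=[2, 3, 4])  # [batch_size, Tq, dim]
  value = tf.random.normal(shape=[2, 5, 4])  # [batch_size, Tv, dim]
  query_mask = tf.constant([[True, True, False]] * 2)               # [2, Tq]
  value_mask = tf.constant([[True, True, True, False, False]] * 2)  # [2, Tv]

  layer = tf.keras.layers.Attention(causal=False, dropout=0.1)
  output = layer([query, value], mask=[query_mask, value_mask])
  # output.shape == [2, 3, 4]; positions where query_mask is False are zero.
  ```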
  """

  def __init__(self, causal=False, dropout=0.0, **kwargs):
    super(BaseDenseAttention, self).__init__(**kwargs)
    self.causal = causal
    self.dropout = dropout
    self.supports_masking = True

  def _calculate_scores(self, query, key):
    """Calculates attention scores.

    Args:
      query: Query tensor of shape `[batch_size, Tq, dim]`.
      key: Key tensor of shape `[batch_size, Tv, dim]`.

    Returns:
      Tensor of shape `[batch_size, Tq, Tv]`.
    """
    raise NotImplementedError

  def _apply_scores(self, scores, value, scores_mask=None, training=None):
    """Applies attention scores to the given value tensor.

    To use this method in your attention layer, follow the steps:

    * Use `query` tensor of shape `[batch_size, Tq, dim]` and `key` tensor of
      shape `[batch_size, Tv, dim]` to calculate the attention `scores`.
    * Pass `scores` and `value` tensors to this method. The method applies
      `scores_mask`, calculates `attention_distribution = softmax(scores)`, then
      returns `matmul(attention_distribution, value)` (a toy walk-through of
      these two steps is sketched after the Returns section below).
    * Apply `query_mask` and return the result.

    Args:
      scores: Scores float tensor of shape `[batch_size, Tq, Tv]`.
      value: Value tensor of shape `[batch_size, Tv, dim]`.
      scores_mask: A boolean mask `Tensor` of shape `[batch_size, 1, Tv]` or
        `[batch_size, Tq, Tv]`. If given, scores at positions where
        `scores_mask==False` do not contribute to the result. It must contain
        at least one `True` value in each line along the last dimension.
      training: Python boolean indicating whether the layer should behave in
        training mode (adding dropout) or in inference mode (no dropout).

    Returns:
      Tensor of shape `[batch_size, Tq, dim]`.
      Attention scores after masking and softmax with shape
        `[batch_size, Tq, Tv]`.
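
    A toy walk-through of the softmax-and-matmul steps above (hypothetical
    values; masking and dropout are omitted):

    ```python
    import tensorflow as tf

    scores = tf.constant([[[1.0, 2.0]]])     # [batch_size=1, Tq=1, Tv=2]
    value = tf.constant([[[1.0], [2.0]]])    # [batch_size=1, Tv=2, dim=1]
    distribution = tf.nn.softmax(scores)     # ~[[[0.269, 0.731]]]
    result = tf.matmul(distribution, value)  # ~[[[1.731]]], shape [1, 1, 1]
    ```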
    """
    if scores_mask is not None:
      padding_mask = math_ops.logical_not(scores_mask)
      # Bias so padding positions do not contribute to attention distribution.
      # Note 65504. is the largest representable float16 value.
      if scores.dtype is dtypes.float16:
        scores -= 65504. * math_ops.cast(padding_mask, dtype=scores.dtype)
      else:
        scores -= 1.e9 * math_ops.cast(padding_mask, dtype=scores.dtype)
    if training is None:
      training = backend.learning_phase()
    weights = nn.softmax(scores)

    def dropped_weights():
      return nn.dropout(weights, rate=self.dropout)

    weights = control_flow_util.smart_cond(training, dropped_weights,
                                           lambda: array_ops.identity(weights))
    return math_ops.matmul(weights, value), weights
  def call(self,
           inputs,
           mask=None,
           training=None,
           return_attention_scores=False):
    self._validate_call_args(inputs=inputs, mask=mask)
    q = inputs[0]
    v = inputs[1]
    k = inputs[2] if len(inputs) > 2 else v
    q_mask = mask[0] if mask else None
    v_mask = mask[1] if mask else None
    scores = self._calculate_scores(query=q, key=k)
    if v_mask is not None:
      # Mask of shape [batch_size, 1, Tv].
      v_mask = array_ops.expand_dims(v_mask, axis=-2)
    if self.causal:
      # Creates a lower triangular mask, so position i cannot attend to
      # positions j > i. This prevents the flow of information from the
      # future into the past.
      scores_shape = array_ops.shape(scores)
      # causal_mask_shape = [1, Tq, Tv].
      causal_mask_shape = array_ops.concat(
          [array_ops.ones_like(scores_shape[:-2]), scores_shape[-2:]],
          axis=0)
      causal_mask = _lower_triangular_mask(causal_mask_shape)
    else:
      causal_mask = None
    scores_mask = _merge_masks(v_mask, causal_mask)
    result, attention_scores = self._apply_scores(
        scores=scores, value=v, scores_mask=scores_mask, training=training)
    if q_mask is not None:
      # Mask of shape [batch_size, Tq, 1].
      q_mask = array_ops.expand_dims(q_mask, axis=-1)
      result *= math_ops.cast(q_mask, dtype=result.dtype)
    if return_attention_scores:
      return result, attention_scores
    return result

  def compute_mask(self, inputs, mask=None):
    self._validate_call_args(inputs=inputs, mask=mask)
    if mask:
      q_mask = mask[0]
      if q_mask is None:
        return None
      return ops.convert_to_tensor_v2_with_dispatch(q_mask)
    return None

  def _validate_call_args(self, inputs, mask):
    """Validates arguments of the call method."""
    class_name = self.__class__.__name__
    if not isinstance(inputs, list):
      raise ValueError(
          '{} layer must be called on a list of inputs, namely [query, value] '
          'or [query, value, key].'.format(class_name))
    if len(inputs) < 2 or len(inputs) > 3:
      raise ValueError(
          '{} layer accepts inputs list of length 2 or 3, namely [query, '
          'value] or [query, value, key]. Given length: {}'.format(
              class_name, len(inputs)))
    if mask:
      if not isinstance(mask, list):
        raise ValueError(
            '{} layer mask must be a list, namely [query_mask, '
            'value_mask].'.format(class_name))
      if len(mask) < 2 or len(mask) > len(inputs):
        raise ValueError(
            '{} layer mask must be a list of length 2, namely [query_mask, '
            'value_mask]. Given length: {}'.format(class_name, len(mask)))

  def get_config(self):
    config = {
        'causal': self.causal,
        'dropout': self.dropout,
    }
    base_config = super(BaseDenseAttention, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))


@keras_export('keras.layers.Attention')
class Attention(BaseDenseAttention):
  """Dot-product attention layer, a.k.a. Luong-style attention.

  Inputs are `query` tensor of shape `[batch_size, Tq, dim]`, `value` tensor of
  shape `[batch_size, Tv, dim]` and `key` tensor of shape
  `[batch_size, Tv, dim]`. The calculation follows the steps:

  1. Calculate scores with shape `[batch_size, Tq, Tv]` as a `query`-`key` dot
     product: `scores = tf.matmul(query, key, transpose_b=True)`.
  2. Use scores to calculate a distribution with shape
     `[batch_size, Tq, Tv]`: `distribution = tf.nn.softmax(scores)`.
  3. Use `distribution` to create a linear combination of `value` with
     shape `[batch_size, Tq, dim]`:
     `return tf.matmul(distribution, value)`.

  Args:
    use_scale: If `True`, will create a scalar variable to scale the attention
      scores.
    causal: Boolean. Set to `True` for decoder self-attention. Adds a mask such
      that position `i` cannot attend to positions `j > i`. This prevents the
      flow of information from the future towards the past.
    dropout: Float between 0 and 1. Fraction of the units to drop for the
      attention scores.

  Call Args:

    inputs: List of the following tensors:
      * query: Query `Tensor` of shape `[batch_size, Tq, dim]`.
      * value: Value `Tensor` of shape `[batch_size, Tv, dim]`.
      * key: Optional key `Tensor` of shape `[batch_size, Tv, dim]`. If not
        given, will use `value` for both `key` and `value`, which is the
        most common case.
    mask: List of the following tensors:
      * query_mask: A boolean mask `Tensor` of shape `[batch_size, Tq]`.
        If given, the output will be zero at the positions where
        `mask==False`.
      * value_mask: A boolean mask `Tensor` of shape `[batch_size, Tv]`.
        If given, will apply the mask such that values at positions where
        `mask==False` do not contribute to the result.
    return_attention_scores: bool, if `True`, returns the attention scores
      (after masking and softmax) as an additional output argument.
    training: Python boolean indicating whether the layer should behave in
      training mode (adding dropout) or in inference mode (no dropout).

  Output:

    Attention outputs of shape `[batch_size, Tq, dim]`.
    [Optional] Attention scores after masking and softmax with shape
      `[batch_size, Tq, Tv]`.

  The meaning of `query`, `value` and `key` depends on the application. In the
  case of text similarity, for example, `query` is the sequence embeddings of
  the first piece of text and `value` is the sequence embeddings of the second
  piece of text. `key` is usually the same tensor as `value`.

  Here is a code example for using `Attention` in a CNN+Attention network:

  ```python
  # Variable-length int sequences.
  query_input = tf.keras.Input(shape=(None,), dtype='int32')
  value_input = tf.keras.Input(shape=(None,), dtype='int32')

  # Embedding lookup.
  token_embedding = tf.keras.layers.Embedding(input_dim=1000, output_dim=64)
  # Query embeddings of shape [batch_size, Tq, dimension].
  query_embeddings = token_embedding(query_input)
  # Value embeddings of shape [batch_size, Tv, dimension].
  value_embeddings = token_embedding(value_input)

  # CNN layer.
  cnn_layer = tf.keras.layers.Conv1D(
      filters=100,
      kernel_size=4,
      # Use 'same' padding so outputs have the same shape as inputs.
      padding='same')
  # Query encoding of shape [batch_size, Tq, filters].
  query_seq_encoding = cnn_layer(query_embeddings)
  # Value encoding of shape [batch_size, Tv, filters].
  value_seq_encoding = cnn_layer(value_embeddings)

  # Query-value attention of shape [batch_size, Tq, filters].
  query_value_attention_seq = tf.keras.layers.Attention()(
      [query_seq_encoding, value_seq_encoding])

  # Reduce over the sequence axis to produce encodings of shape
  # [batch_size, filters].
  query_encoding = tf.keras.layers.GlobalAveragePooling1D()(
      query_seq_encoding)
  query_value_attention = tf.keras.layers.GlobalAveragePooling1D()(
      query_value_attention_seq)

  # Concatenate query and document encodings to produce a DNN input layer.
  input_layer = tf.keras.layers.Concatenate()(
      [query_encoding, query_value_attention])

  # Add DNN layers, and create Model.
  # ...
  ```
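
  A shorter, hypothetical sketch of causal self-attention together with the
  scores output (assumes a TF version in which `return_attention_scores` is
  available as a call argument):

  ```python
  import tensorflow as tf

  # Causal self-attention over a single sequence (query == value).
  seq = tf.random.normal(shape=[2, 6, 16])  # [batch_size, T, dim]
  layer = tf.keras.layers.Attention(causal=True)
  outputs, scores = layer([seq, seq], return_attention_scores=True)
  # outputs.shape == [2, 6, 16]; scores.shape == [2, 6, 6], and positions
  # j > i receive (near-)zero attention weight.
  ```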
  """

  def __init__(self, use_scale=False, **kwargs):
    super(Attention, self).__init__(**kwargs)
    self.use_scale = use_scale

  def build(self, input_shape):
    """Creates scale variable if use_scale==True."""
    if self.use_scale:
      self.scale = self.add_weight(
          name='scale',
          shape=(),
          initializer=init_ops.ones_initializer(),
          dtype=self.dtype,
          trainable=True)
    else:
      self.scale = None
    super(Attention, self).build(input_shape)

  def _calculate_scores(self, query, key):
    """Calculates attention scores as a query-key dot product.

    Args:
      query: Query tensor of shape `[batch_size, Tq, dim]`.
      key: Key tensor of shape `[batch_size, Tv, dim]`.

    Returns:
      Tensor of shape `[batch_size, Tq, Tv]`.
    """
    scores = math_ops.matmul(query, key, transpose_b=True)
    if self.scale is not None:
      scores *= self.scale
    return scores

  def get_config(self):
    config = {'use_scale': self.use_scale}
    base_config = super(Attention, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))


@keras_export('keras.layers.AdditiveAttention')
class AdditiveAttention(BaseDenseAttention):
  """Additive attention layer, a.k.a. Bahdanau-style attention.

  Inputs are `query` tensor of shape `[batch_size, Tq, dim]`, `value` tensor of
  shape `[batch_size, Tv, dim]` and `key` tensor of shape
  `[batch_size, Tv, dim]`. The calculation follows the steps:

  1. Reshape `query` and `value` into shapes `[batch_size, Tq, 1, dim]`
     and `[batch_size, 1, Tv, dim]` respectively.
  2. Calculate scores with shape `[batch_size, Tq, Tv]` as a non-linear
     sum: `scores = tf.reduce_sum(tf.tanh(query + value), axis=-1)`
  3. Use scores to calculate a distribution with shape
     `[batch_size, Tq, Tv]`: `distribution = tf.nn.softmax(scores)`.
  4. Use `distribution` to create a linear combination of `value` with
     shape `[batch_size, Tq, dim]`:
     `return tf.matmul(distribution, value)`.

  Args:
    use_scale: If `True`, will create a variable to scale the attention scores.
    causal: Boolean. Set to `True` for decoder self-attention. Adds a mask such
      that position `i` cannot attend to positions `j > i`. This prevents the
      flow of information from the future towards the past.
    dropout: Float between 0 and 1. Fraction of the units to drop for the
      attention scores.

  Call Args:

    inputs: List of the following tensors:
      * query: Query `Tensor` of shape `[batch_size, Tq, dim]`.
      * value: Value `Tensor` of shape `[batch_size, Tv, dim]`.
      * key: Optional key `Tensor` of shape `[batch_size, Tv, dim]`. If not
        given, will use `value` for both `key` and `value`, which is the
        most common case.
    mask: List of the following tensors:
      * query_mask: A boolean mask `Tensor` of shape `[batch_size, Tq]`.
        If given, the output will be zero at the positions where
        `mask==False`.
      * value_mask: A boolean mask `Tensor` of shape `[batch_size, Tv]`.
        If given, will apply the mask such that values at positions where
        `mask==False` do not contribute to the result.
    training: Python boolean indicating whether the layer should behave in
      training mode (adding dropout) or in inference mode (no dropout).
    return_attention_scores: bool, if `True`, returns the attention scores
      (after masking and softmax) as an additional output argument.

  Output:

    Attention outputs of shape `[batch_size, Tq, dim]`.
    [Optional] Attention scores after masking and softmax with shape
      `[batch_size, Tq, Tv]`.

  The meaning of `query`, `value` and `key` depends on the application. In the
  case of text similarity, for example, `query` is the sequence embeddings of
  the first piece of text and `value` is the sequence embeddings of the second
  piece of text. `key` is usually the same tensor as `value`.

  Here is a code example for using `AdditiveAttention` in a CNN+Attention
  network:

  ```python
  # Variable-length int sequences.
  query_input = tf.keras.Input(shape=(None,), dtype='int32')
  value_input = tf.keras.Input(shape=(None,), dtype='int32')

  # Embedding lookup.
  token_embedding = tf.keras.layers.Embedding(input_dim=1000, output_dim=64)
  # Query embeddings of shape [batch_size, Tq, dimension].
  query_embeddings = token_embedding(query_input)
  # Value embeddings of shape [batch_size, Tv, dimension].
  value_embeddings = token_embedding(value_input)

  # CNN layer.
  cnn_layer = tf.keras.layers.Conv1D(
      filters=100,
      kernel_size=4,
      # Use 'same' padding so outputs have the same shape as inputs.
      padding='same')
  # Query encoding of shape [batch_size, Tq, filters].
  query_seq_encoding = cnn_layer(query_embeddings)
  # Value encoding of shape [batch_size, Tv, filters].
  value_seq_encoding = cnn_layer(value_embeddings)

  # Query-value attention of shape [batch_size, Tq, filters].
  query_value_attention_seq = tf.keras.layers.AdditiveAttention()(
      [query_seq_encoding, value_seq_encoding])

  # Reduce over the sequence axis to produce encodings of shape
  # [batch_size, filters].
  query_encoding = tf.keras.layers.GlobalAveragePooling1D()(
      query_seq_encoding)
  query_value_attention = tf.keras.layers.GlobalAveragePooling1D()(
      query_value_attention_seq)

  # Concatenate query and document encodings to produce a DNN input layer.
  input_layer = tf.keras.layers.Concatenate()(
      [query_encoding, query_value_attention])

  # Add DNN layers, and create Model.
  # ...
  ```
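
  A minimal direct-call sketch with an explicit `key` tensor (toy shapes; all
  names here are illustrative):

  ```python
  import tensorflow as tf

  query = tf.random.normal(shape=[4, 2, 16])  # [batch_size, Tq, dim]
  value = tf.random.normal(shape=[4, 7, 16])  # [batch_size, Tv, dim]
  key = tf.random.normal(shape=[4, 7, 16])    # [batch_size, Tv, dim]

  attention = tf.keras.layers.AdditiveAttention(use_scale=True)
  output = attention([query, value, key])     # shape [4, 2, 16]
  ```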
  """

  def __init__(self, use_scale=True, **kwargs):
    super(AdditiveAttention, self).__init__(**kwargs)
    self.use_scale = use_scale

  def build(self, input_shape):
    v_shape = tensor_shape.TensorShape(input_shape[1])
    dim = v_shape[-1]
    if isinstance(dim, tensor_shape.Dimension):
      dim = dim.value
    if self.use_scale:
      self.scale = self.add_weight(
          name='scale',
          shape=[dim],
          initializer=init_ops.glorot_uniform_initializer(),
          dtype=self.dtype,
          trainable=True)
    else:
      self.scale = None
    super(AdditiveAttention, self).build(input_shape)

  def _calculate_scores(self, query, key):
    """Calculates attention scores as a nonlinear sum of query and key.

    Args:
      query: Query tensor of shape `[batch_size, Tq, dim]`.
      key: Key tensor of shape `[batch_size, Tv, dim]`.

    Returns:
      Tensor of shape `[batch_size, Tq, Tv]`.
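
    A shape-level sketch of this broadcast-and-sum computation with plain TF
    ops (toy tensors; the learned `scale` is omitted):

    ```python
    import tensorflow as tf

    query = tf.random.normal([2, 3, 8])              # [batch_size, Tq, dim]
    key = tf.random.normal([2, 5, 8])                # [batch_size, Tv, dim]
    q = tf.expand_dims(query, axis=-2)               # [2, 3, 1, 8]
    k = tf.expand_dims(key, axis=-3)                 # [2, 1, 5, 8]
    scores = tf.reduce_sum(tf.tanh(q + k), axis=-1)  # [2, 3, 5] = [batch, Tq, Tv]
    ```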
    """
    # Reshape tensors to enable broadcasting.
    # Reshape into [batch_size, Tq, 1, dim].
    q_reshaped = array_ops.expand_dims(query, axis=-2)
    # Reshape into [batch_size, 1, Tv, dim].
    k_reshaped = array_ops.expand_dims(key, axis=-3)
    if self.use_scale:
      scale = self.scale
    else:
      scale = 1.
    return math_ops.reduce_sum(
        scale * math_ops.tanh(q_reshaped + k_reshaped), axis=-1)

  def get_config(self):
    config = {'use_scale': self.use_scale}
    base_config = super(AdditiveAttention, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))


def _lower_triangular_mask(shape):
  """Creates a lower-triangular boolean mask over the last 2 dimensions."""
  row_index = math_ops.cumsum(
      array_ops.ones(shape=shape, dtype=dtypes.int32), axis=-2)
  col_index = math_ops.cumsum(
      array_ops.ones(shape=shape, dtype=dtypes.int32), axis=-1)
  return math_ops.greater_equal(row_index, col_index)


def _merge_masks(x, y):
  """Merges two optional boolean masks with logical AND."""
  if x is None:
    return y
  if y is None:
    return x
  return math_ops.logical_and(x, y)