"""Keras-based multi-head attention layer."""

import collections
import math
import string

import numpy as np
import tensorflow.compat.v2 as tf

from keras import constraints
from keras import initializers
from keras import regularizers
from keras.engine.base_layer import Layer
from keras.layers import activation
from keras.layers import core
from keras.layers import regularization
from keras.utils import tf_utils

from tensorflow.python.platform import tf_logging as logging
from tensorflow.python.util.tf_export import keras_export

_CHR_IDX = string.ascii_lowercase


def _build_attention_equation(rank, attn_axes):
    """Builds einsum equations for the attention computation.

    Query, key, value inputs after projection are expected to have the shape:
    `(bs, <non-attention dims>, <attention dims>, num_heads, channels)`.
    `bs` and `<non-attention dims>` are treated as `<batch dims>`.

    The attention operations can be generalized:
    (1) Query-key dot product:
    `(<batch dims>, <query attention dims>, num_heads, channels), (<batch dims>,
    <key attention dims>, num_heads, channels) -> (<batch dims>,
    num_heads, <query attention dims>, <key attention dims>)`
    (2) Combination:
    `(<batch dims>, num_heads, <query attention dims>, <key attention dims>),
    (<batch dims>, <value attention dims>, num_heads, channels) -> (<batch
    dims>, <query attention dims>, num_heads, channels)`

    Args:
      rank: Rank of query, key, value tensors.
      attn_axes: List/tuple of axes, `[-1, rank)`,
        that attention will be applied to.

    Returns:
      Einsum equations.
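
    For example, for rank-4 projected tensors with attention applied over
    axis 1 (the default that the layer selects for `(B, T, N, H)` inputs),
    the generated equations are:

    >>> _build_attention_equation(rank=4, attn_axes=(1,))
    ('aecd,abcd->acbe', 'acbe,aecd->abcd', 4)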
    """
    target_notation = _CHR_IDX[:rank]
    # `batch_dims` includes the head dim.
    batch_dims = tuple(np.delete(range(rank), attn_axes + (rank - 1,)))
    letter_offset = rank
    source_notation = ""
    for i in range(rank):
        if i in batch_dims or i == rank - 1:
            source_notation += target_notation[i]
        else:
            source_notation += _CHR_IDX[letter_offset]
            letter_offset += 1

    product_notation = "".join(
        [target_notation[i] for i in batch_dims]
        + [target_notation[i] for i in attn_axes]
        + [source_notation[i] for i in attn_axes]
    )
    dot_product_equation = "%s,%s->%s" % (
        source_notation,
        target_notation,
        product_notation,
    )
    attn_scores_rank = len(product_notation)
    combine_equation = "%s,%s->%s" % (
        product_notation,
        source_notation,
        target_notation,
    )
    return dot_product_equation, combine_equation, attn_scores_rank


def _build_proj_equation(free_dims, bound_dims, output_dims):
    """Builds an einsum equation for projections inside multi-head attention."""
    input_str = ""
    kernel_str = ""
    output_str = ""
    bias_axes = ""
    letter_offset = 0
    for i in range(free_dims):
        char = _CHR_IDX[i + letter_offset]
        input_str += char
        output_str += char

    letter_offset += free_dims
    for i in range(bound_dims):
        char = _CHR_IDX[i + letter_offset]
        input_str += char
        kernel_str += char

    letter_offset += bound_dims
    for i in range(output_dims):
        char = _CHR_IDX[i + letter_offset]
        kernel_str += char
        output_str += char
        bias_axes += char
    equation = f"{input_str},{kernel_str}->{output_str}"

    return equation, bias_axes, len(output_str)


def _get_output_shape(output_rank, known_last_dims):
    return [None] * (output_rank - len(known_last_dims)) + list(known_last_dims)


@keras_export("keras.layers.MultiHeadAttention")
class MultiHeadAttention(Layer):
    """MultiHeadAttention layer.

    This is an implementation of multi-headed attention as described in the
    paper "Attention is all you Need" (Vaswani et al., 2017).
    If `query`, `key,` `value` are the same, then
    this is self-attention. Each timestep in `query` attends to the
    corresponding sequence in `key`, and returns a fixed-width vector.

    This layer first projects `query`, `key` and `value`. These are
    (effectively) a list of tensors of length `num_heads`, where the
    corresponding shapes are `(batch_size, <query dimensions>, key_dim)`,
    `(batch_size, <key/value dimensions>, key_dim)`,
    `(batch_size, <key/value dimensions>, value_dim)`.

    Then, the query and key tensors are dot-producted and scaled. These are
    softmaxed to obtain attention probabilities. The value tensors are then
    interpolated by these probabilities, then concatenated back to a single
    tensor.

    Finally, the result tensor with the last dimension as `value_dim` can take
    a linear projection and be returned.

    When using `MultiHeadAttention` inside a custom layer, the custom layer must
    implement its own `build()` method and call `MultiHeadAttention`'s
    `_build_from_signature()` there.
    This enables weights to be restored correctly when the model is loaded.

    Examples:

    Performs 1D cross-attention over two sequence inputs with an attention mask.
    Returns the additional attention weights over heads.

    >>> layer = MultiHeadAttention(num_heads=2, key_dim=2)
    >>> target = tf.keras.Input(shape=[8, 16])
    >>> source = tf.keras.Input(shape=[4, 16])
    >>> output_tensor, weights = layer(target, source,
    ...                                return_attention_scores=True)
    >>> print(output_tensor.shape)
    (None, 8, 16)
    >>> print(weights.shape)
    (None, 2, 8, 4)

    Performs 2D self-attention over a 5D input tensor on axes 2 and 3.

    >>> layer = MultiHeadAttention(
    ...     num_heads=2, key_dim=2, attention_axes=(2, 3))
    >>> input_tensor = tf.keras.Input(shape=[5, 3, 4, 16])
    >>> output_tensor = layer(input_tensor, input_tensor)
    >>> print(output_tensor.shape)
    (None, 5, 3, 4, 16)
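
    When `MultiHeadAttention` is wrapped in a custom layer that defines its own
    `build()` (as described above), the inner layer can be built there via
    `_build_from_signature()`. The wrapper class below is only an illustrative
    sketch of that pattern.

    >>> class CrossAttentionBlock(tf.keras.layers.Layer):
    ...     def __init__(self):
    ...         super().__init__()
    ...         self.attn = MultiHeadAttention(num_heads=2, key_dim=2)
    ...     def build(self, input_shape):
    ...         self.attn._build_from_signature(query=input_shape,
    ...                                         value=input_shape)
    ...     def call(self, inputs):
    ...         return self.attn(inputs, inputs)
    >>> block = CrossAttentionBlock()
    >>> output_tensor = block(tf.keras.Input(shape=[8, 16]))
    >>> print(output_tensor.shape)
    (None, 8, 16)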

    Args:
      num_heads: Number of attention heads.
      key_dim: Size of each attention head for query and key.
      value_dim: Size of each attention head for value.
      dropout: Dropout probability.
      use_bias: Boolean, whether the dense layers use bias vectors/matrices.
      output_shape: The expected shape of an output tensor, besides the batch
        and sequence dims. If not specified, projects back to the query feature
        dim (the query input's last dimension).
      attention_axes: axes over which the attention is applied. `None` means
        attention over all axes except batch, heads, and features.
      kernel_initializer: Initializer for dense layer kernels.
      bias_initializer: Initializer for dense layer biases.
      kernel_regularizer: Regularizer for dense layer kernels.
      bias_regularizer: Regularizer for dense layer biases.
      activity_regularizer: Regularizer for dense layer activity.
      kernel_constraint: Constraint for dense layer kernels.
      bias_constraint: Constraint for dense layer biases.

    Call arguments:
      query: Query `Tensor` of shape `(B, T, dim)`.
      value: Value `Tensor` of shape `(B, S, dim)`.
      key: Optional key `Tensor` of shape `(B, S, dim)`. If not given, will use
        `value` for both `key` and `value`, which is the most common case.
      attention_mask: a boolean mask of shape `(B, T, S)`, that prevents
        attention to certain positions. The boolean mask specifies which query
        elements can attend to which key elements; 1 indicates attention and 0
        indicates no attention. Broadcasting can happen for the missing batch
        dimensions and the head dimension.
      return_attention_scores: A boolean to indicate whether the output should
        be `(attention_output, attention_scores)` if `True`, or
        `attention_output` if `False`. Defaults to `False`.
      training: Python boolean indicating whether the layer should behave in
        training mode (adding dropout) or in inference mode (no dropout).
        Defaults to either using the training mode of the parent layer/model,
        or False (inference) if there is no parent layer.
      use_causal_mask: A boolean to indicate whether to apply a causal mask to
        prevent tokens from attending to future tokens (e.g., used in a decoder
        Transformer).

    Returns:
      attention_output: The result of the computation, of shape `(B, T, E)`,
        where `T` is the target sequence length and `E` is the query input's last
        dimension if `output_shape` is `None`. Otherwise, the multi-head outputs
        are projected to the shape specified by `output_shape`.
      attention_scores: [Optional] multi-head attention coefficients over
        attention axes.
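
    For example, an explicit padding mask can be combined with the causal mask;
    the random tensors below are illustrative stand-ins for real inputs:

    >>> layer = MultiHeadAttention(num_heads=2, key_dim=2)
    >>> query = tf.random.uniform((2, 8, 16))
    >>> value = tf.random.uniform((2, 4, 16))
    >>> padding_mask = tf.ones((2, 8, 4), dtype=tf.bool)
    >>> output = layer(query, value, attention_mask=padding_mask,
    ...                use_causal_mask=True)
    >>> print(output.shape)
    (2, 8, 16)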
    """

    def __init__(
        self,
        num_heads,
        key_dim,
        value_dim=None,
        dropout=0.0,
        use_bias=True,
        output_shape=None,
        attention_axes=None,
        kernel_initializer="glorot_uniform",
        bias_initializer="zeros",
        kernel_regularizer=None,
        bias_regularizer=None,
        activity_regularizer=None,
        kernel_constraint=None,
        bias_constraint=None,
        **kwargs
    ):
        super().__init__(**kwargs)
        self.supports_masking = True
        self._num_heads = num_heads
        self._key_dim = key_dim
        self._value_dim = value_dim if value_dim else key_dim
        self._dropout = dropout
        self._use_bias = use_bias
        self._output_shape = output_shape
        self._kernel_initializer = initializers.get(kernel_initializer)
        self._bias_initializer = initializers.get(bias_initializer)
        self._kernel_regularizer = regularizers.get(kernel_regularizer)
        self._bias_regularizer = regularizers.get(bias_regularizer)
        self._activity_regularizer = regularizers.get(activity_regularizer)
        self._kernel_constraint = constraints.get(kernel_constraint)
        self._bias_constraint = constraints.get(bias_constraint)
        if attention_axes is not None and not isinstance(
            attention_axes, collections.abc.Sized
        ):
            self._attention_axes = (attention_axes,)
        else:
            self._attention_axes = attention_axes
        self._built_from_signature = False
        self._query_shape, self._key_shape, self._value_shape = None, None, None

    def get_config(self):
        config = {
            "num_heads": self._num_heads,
            "key_dim": self._key_dim,
            "value_dim": self._value_dim,
            "dropout": self._dropout,
            "use_bias": self._use_bias,
            "output_shape": self._output_shape,
            "attention_axes": self._attention_axes,
            "kernel_initializer": initializers.serialize(
                self._kernel_initializer
            ),
            "bias_initializer": initializers.serialize(self._bias_initializer),
            "kernel_regularizer": regularizers.serialize(
                self._kernel_regularizer
            ),
            "bias_regularizer": regularizers.serialize(self._bias_regularizer),
            "activity_regularizer": regularizers.serialize(
                self._activity_regularizer
            ),
            "kernel_constraint": constraints.serialize(self._kernel_constraint),
            "bias_constraint": constraints.serialize(self._bias_constraint),
            "query_shape": self._query_shape,
            "key_shape": self._key_shape,
            "value_shape": self._value_shape,
        }
        base_config = super().get_config()
        return dict(list(base_config.items()) + list(config.items()))

    @classmethod
    def from_config(cls, config):
        # If the serialized config memorized the input shapes, rebuild the
        # sublayer weights from them; otherwise the layer is created unbuilt.
        query_shape = config.pop("query_shape")
        key_shape = config.pop("key_shape")
        value_shape = config.pop("value_shape")
        layer = cls(**config)
        if None in [query_shape, key_shape, value_shape]:
            logging.warning(
                "One of dimensions of the input shape is missing. It "
                "should have been memorized when the layer was serialized. "
                "%s is created without weights.",
                str(cls),
            )
        else:
            layer._build_from_signature(query_shape, value_shape, key_shape)
        return layer

    def _build_from_signature(self, query, value, key=None):
        """Builds layers and variables.

        Once the method is called, self._built_from_signature will be set to
        True.

        Args:
          query: Query tensor or TensorShape.
          value: Value tensor or TensorShape.
          key: Key tensor or TensorShape.
        TshapeNr      )r#   r$   query)rN   r%   namekeyvalueattention_output)rD   hasattrtfTensorShaperi   rE   rG   rF   r	   Zmaybe_init_scoper   r'   r   EinsumDenser*   r1   r2   r5   _get_common_kwargs_for_sublayer_query_dense
_key_denser3   _value_dense_build_attention_make_output_dense_output_dense)rH   rk   rn   rm   r"   einsum_equationr%   r)   r   r   r   re   B  s\    




z(MultiHeadAttention._build_from_signaturec             C   sX   t | j| j| j| j| jd}| jj| j	 }| j
j| j
	 }||d< ||d< |S )N)rR   rS   rT   rU   rV   rP   rQ   )r^   r:   r;   r<   r=   r>   r8   rX   rh   r]   r9   )rH   common_kwargsrP   rQ   r   r   r   rt     s    
z2MultiHeadAttention._get_common_kwargs_for_sublayerc             C   sz   | j r(t| j tjjs | j g}q4| j }n| jd g}t|dt|d\}}}tj	|ft
|d || jrj|nd|d|S )a*  Builds the output projection matrix.

        Args:
          free_dims: Number of free dimensions for einsum equation building.
          common_kwargs: Common keyword arguments for einsum layer.
          name: Name for the projection layer.

        Returns:
          Projection layer.
        """
        if self._output_shape:
            if not isinstance(self._output_shape, collections.abc.Sized):
                output_shape = [self._output_shape]
            else:
                output_shape = self._output_shape
        else:
            output_shape = [self._query_shape[-1]]
        einsum_equation, bias_axes, output_rank = _build_proj_equation(
            free_dims, bound_dims=2, output_dims=len(output_shape)
        )
        return core.EinsumDense(
            einsum_equation,
            output_shape=_get_output_shape(output_rank - 1, output_shape),
            bias_axes=bias_axes if self._use_bias else None,
            name=name,
            **common_kwargs
        )

    def _build_attention(self, rank):
        """Builds multi-head dot-product attention computations.

        This function builds attributes necessary for `_compute_attention` to
        customize attention computation to replace the default dot-product
        attention.

        Args:
          rank: the rank of query, key, value tensors.
        """
        if self._attention_axes is None:
            self._attention_axes = tuple(range(1, rank - 2))
        else:
            self._attention_axes = tuple(self._attention_axes)
        (
            self._dot_product_equation,
            self._combine_equation,
            attn_scores_rank,
        ) = _build_attention_equation(rank, attn_axes=self._attention_axes)
        norm_axes = tuple(
            range(attn_scores_rank - len(self._attention_axes), attn_scores_rank)
        )
        self._softmax = activation.Softmax(axis=norm_axes)
        self._dropout_layer = regularization.Dropout(rate=self._dropout)

    def _masked_softmax(self, attention_scores, attention_mask=None):
        # Normalize the attention scores to probabilities.
        # `attention_scores` = [B, N, T, S]
        if attention_mask is not None:
            # The expand dim happens starting from the `num_heads` dimension,
            # (<batch_dims>, num_heads, <query_attention_dims,
            # key_attention_dims>).
            mask_expansion_axis = -len(self._attention_axes) * 2 - 1
            for _ in range(
                len(attention_scores.shape) - len(attention_mask.shape)
            ):
                attention_mask = tf.expand_dims(
                    attention_mask, axis=mask_expansion_axis
                )
        return self._softmax(attention_scores, attention_mask)

    def _compute_attention(
        self, query, key, value, attention_mask=None, training=None
    ):
        """Applies Dot-product attention with query, key, value tensors.

        This function defines the computation inside `call` with projected
        multi-head Q, K, V inputs. Users can override this function for
        customized attention implementation.

        Args:
          query: Projected query `Tensor` of shape `(B, T, N, key_dim)`.
          key: Projected key `Tensor` of shape `(B, S, N, key_dim)`.
          value: Projected value `Tensor` of shape `(B, S, N, value_dim)`.
          attention_mask: a boolean mask of shape `(B, T, S)`, that prevents
            attention to certain positions. It is generally not needed if the
            `query` and `value` (and/or `key`) are masked.
          training: Python boolean indicating whether the layer should behave in
            training mode (adding dropout) or in inference mode (doing nothing).

        Returns:
          attention_output: Multi-headed outputs of attention computation.
          attention_scores: Multi-headed attention weights.
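
        As noted above, subclasses may override this method to customize the
        attention computation while reusing the projection layers; the variant
        below is an illustrative sketch that simply drops the
        `1/sqrt(key_dim)` scaling:

        >>> class UnscaledMultiHeadAttention(MultiHeadAttention):
        ...     def _compute_attention(self, query, key, value,
        ...                            attention_mask=None, training=None):
        ...         scores = tf.einsum(self._dot_product_equation, key, query)
        ...         scores = self._masked_softmax(scores, attention_mask)
        ...         scores = self._dropout_layer(scores, training=training)
        ...         return (
        ...             tf.einsum(self._combine_equation, scores, value),
        ...             scores,
        ...         )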
        """
        # Note: Applying scalar multiply at the smaller end of einsum improves
        # XLA performance, but may introduce slight numeric differences in
        # the Transformer attention head.
        query = tf.multiply(query, 1.0 / math.sqrt(float(self._key_dim)))

        # Take the dot product between "query" and "key" to get the raw
        # attention scores.
        attention_scores = tf.einsum(self._dot_product_equation, key, query)

        attention_scores = self._masked_softmax(
            attention_scores, attention_mask
        )

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_scores_dropout = self._dropout_layer(
            attention_scores, training=training
        )

        # `context_layer` = [B, T, N, H]
        attention_output = tf.einsum(
            self._combine_equation, attention_scores_dropout, value
        )
        return attention_output, attention_scores

    def call(
        self,
        query,
        value,
        key=None,
        attention_mask=None,
        return_attention_scores=False,
        training=None,
        use_causal_mask=False,
    ):
        attention_mask = self._compute_attention_mask(
            query,
            value,
            key=key,
            attention_mask=attention_mask,
            use_causal_mask=use_causal_mask,
        )

        if not self._built_from_signature:
            self._build_from_signature(query=query, value=value, key=key)
        if key is None:
            key = value

        # Convert RaggedTensor inputs to dense tensors.
        query_is_ragged = isinstance(query, tf.RaggedTensor)
        if query_is_ragged:
            query_lengths = query.nested_row_lengths()
            query = query.to_tensor()

        key_is_ragged = isinstance(key, tf.RaggedTensor)
        value_is_ragged = isinstance(value, tf.RaggedTensor)
        if key_is_ragged and value_is_ragged:
            # Ensure they have the same shape.
            bounding_shape = tf.math.maximum(
                key.bounding_shape(), value.bounding_shape()
            )
            key = key.to_tensor(shape=bounding_shape)
            value = value.to_tensor(shape=bounding_shape)
        elif key_is_ragged:
            key = key.to_tensor(shape=tf.shape(value))
        elif value_is_ragged:
            value = value.to_tensor(shape=tf.shape(key))

        #   N = `num_attention_heads`
        #   H = `size_per_head`
        # `query` = [B, T, N, H]
        query = self._query_dense(query)

        # `key` = [B, S, N, H]
        key = self._key_dense(key)

        # `value` = [B, S, N, H]
        value = self._value_dense(value)

        attention_output, attention_scores = self._compute_attention(
            query, key, value, attention_mask, training
        )
        attention_output = self._output_dense(attention_output)

        if query_is_ragged:
            attention_output = tf.RaggedTensor.from_tensor(
                attention_output, lengths=query_lengths
            )

        if return_attention_scores:
            return attention_output, attention_scores
        return attention_output

    def _compute_attention_mask(
        self, query, value, key=None, attention_mask=None, use_causal_mask=False
    ):
        """Computes the attention mask, using the Keras masks of the inputs.
        * The `query`'s mask is reshaped from [B, T] to [B, T, 1].
        * The `value`'s mask is reshaped from [B, S] to [B, 1, S].
        * The `key`'s mask is reshaped from [B, S] to [B, 1, S]. The `key`'s
          mask is ignored if `key` is `None` or if `key is value`.
        * If `use_causal_mask=True`, then the causal mask is computed. Its shape
          is [1, T, S].

        All defined masks are merged using a logical AND operation (`&`).

        In general, if the `query` and `value` are masked, then there is no need
        to define the `attention_mask`.

        Args:
          query: Projected query `Tensor` of shape `(B, T, N, key_dim)`.
          key: Projected key `Tensor` of shape `(B, T, N, key_dim)`.
          value: Projected value `Tensor` of shape `(B, T, N, value_dim)`.
          attention_mask: a boolean mask of shape `(B, T, S)`, that prevents
            attention to certain positions.
          use_causal_mask: A boolean to indicate whether to apply a causal mask
            to prevent tokens from attending to future tokens (e.g., used in a
            decoder Transformer).

        Returns:
          attention_mask: a boolean mask of shape `(B, T, S)`, that prevents
            attention to certain positions, based on the Keras masks of the
            `query`, `key`, `value`, and `attention_mask` tensors, and the
            causal mask if `use_causal_mask=True`.
        Z_keras_maskN)getattrrq   castboolZnewaxis_compute_causal_mask)rH   rk   rn   rm   r   r   Z
query_maskZ
value_maskZkey_maskZ	auto_maskmaskr   r   r   r   a  s,    !
z*MultiHeadAttention._compute_attention_maskc             C   sH   t |d }|dkr|nt |d }t jt d||ft jddS )a  Computes a causal mask (e.g., for masked self-attention layers).

        For example, if query and value both contain sequences of length 4,
        this function returns a boolean `Tensor` equal to:

        ```
        [[[True,  False, False, False],
          [True,  True,  False, False],
          [True,  True,  True,  False],
          [True,  True,  True,  True]]]
        ```
        Args:
          query: query `Tensor` of shape `(B, T, ...)`.
          value: value `Tensor` of shape `(B, S, ...)` (optional, defaults to
          query).

        Returns:
          mask: a boolean `Tensor` of shape [1, T, S] containing a lower
                triangular matrix of shape [T, S].
        r   Nr}   r   )rq   ri   ZlinalgZ	band_partZonesr   )rH   rk   rn   Zq_seq_lengthZv_seq_lengthr   r   r   r     s    z'MultiHeadAttention._compute_causal_maskc             C   s   |d kr|}t |}t |}t |}|d |d krXtd|d  d|d  d|dd |dd krtd| d| | jr|d d | jS |S )Nr}   zMThe last dimension of `query_shape` and `value_shape` must be equal, but are z, z@. Received: query_shape={query_shape}, value_shape={value_shape}r   zRAll dimensions of `value` and `key`, except the last one, must be equal. Received z and )rq   rr   
ValueErrorr6   Zconcatenate)rH   rY   r[   rZ   r   r   r   compute_output_shape  s    


z'MultiHeadAttention.compute_output_shape)Nr,   TNNr-   r.   NNNNN)N)N)N)NN)NNFNF)NNF)N)N)__name__
__module____qualname____doc__r0   r]   classmethodrh   re   rt   ry   rx   r   r   r   r   r   r   __classcell__r   r   )rX   r   r+      s:   d           
O


3    
<
?
r+   )!r   r@   r   stringnumpyr   Ztensorflow.compat.v2compatZv2rq   Zkerasr   r   r   Zkeras.engine.base_layerr   Zkeras.layersr   r   r   Zkeras.utilsr	   Ztensorflow.python.platformr
   rb   Z tensorflow.python.util.tf_exportr   ascii_lowercaser   r    r'   r*   r+   r   r   r   r   <module>   s*   8