"""Dataset snapshot and related functionality."""
from tensorflow.python.data.ops import dataset_ops
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.framework import random_seed
from tensorflow.python.ops import gen_experimental_dataset_ops as ged_ops
from tensorflow.python.util import deprecation
from tensorflow.python.util.tf_export import tf_export

COMPRESSION_GZIP = "GZIP"
COMPRESSION_SNAPPY = "SNAPPY"
COMPRESSION_NONE = None


class _LegacySnapshotDataset(dataset_ops.UnaryUnchangedStructureDataset):
  """A Dataset that captures a snapshot or reads from a snapshot."""

  def __init__(self,
               input_dataset,
               path,
               compression=None,
               reader_path_prefix=None,
               writer_path_prefix=None,
               shard_size_bytes=None,
               pending_snapshot_expiry_seconds=None,
               num_reader_threads=None,
               reader_buffer_size=None,
               num_writer_threads=None,
               writer_buffer_size=None,
               shuffle_on_read=None,
               shuffle_seed=None,
               mode=None,
               snapshot_name=None):
    # Fall back to the op-level defaults when an argument is not provided.
    self._compression = compression if compression is not None else ""
    self._reader_path_prefix = (
        reader_path_prefix if reader_path_prefix is not None else "")
    self._writer_path_prefix = (
        writer_path_prefix if writer_path_prefix is not None else "")
    self._shard_size_bytes = (
        shard_size_bytes if shard_size_bytes is not None else -1)
    self._pending_snapshot_expiry_seconds = (
        pending_snapshot_expiry_seconds
        if pending_snapshot_expiry_seconds is not None else -1)
    self._num_reader_threads = (
        num_reader_threads if num_reader_threads is not None else -1)
    self._reader_buffer_size = (
        reader_buffer_size if reader_buffer_size is not None else -1)
    self._num_writer_threads = (
        num_writer_threads if num_writer_threads is not None else -1)
    self._writer_buffer_size = (
        writer_buffer_size if writer_buffer_size is not None else -1)
    self._shuffle_on_read = (
        shuffle_on_read if shuffle_on_read is not None else False)
    self._mode = mode if mode is not None else "auto"
    self._snapshot_name = snapshot_name if snapshot_name is not None else ""

    self._seed, self._seed2 = random_seed.get_seed(shuffle_seed)

    self._input_dataset = input_dataset
    self._path = ops.convert_to_tensor(path, dtype=dtypes.string, name="path")

    variant_tensor = ged_ops.snapshot_dataset(
        self._input_dataset._variant_tensor,  # pylint: disable=protected-access
        path=self._path,
        compression=self._compression,
        reader_path_prefix=self._reader_path_prefix,
        writer_path_prefix=self._writer_path_prefix,
        shard_size_bytes=self._shard_size_bytes,
        pending_snapshot_expiry_seconds=self._pending_snapshot_expiry_seconds,
        num_reader_threads=self._num_reader_threads,
        reader_buffer_size=self._reader_buffer_size,
        num_writer_threads=self._num_writer_threads,
        writer_buffer_size=self._writer_buffer_size,
        shuffle_on_read=self._shuffle_on_read,
        seed=self._seed,
        seed2=self._seed2,
        mode=self._mode,
        snapshot_name=self._snapshot_name,
        **self._flat_structure)

    super(_LegacySnapshotDataset, self).__init__(input_dataset, variant_tensor)


@deprecation.deprecated(
    None, "Use `tf.data.experimental.snapshot(...)` instead.")
def legacy_snapshot(path,
                    compression=None,
                    reader_path_prefix=None,
                    writer_path_prefix=None,
                    shard_size_bytes=None,
                    pending_snapshot_expiry_seconds=None,
                    num_reader_threads=None,
                    reader_buffer_size=None,
                    num_writer_threads=None,
                    writer_buffer_size=None,
                    shuffle_on_read=None,
                    shuffle_seed=None,
                    mode=None,
                    snapshot_name=None):
  """Writes to/reads from a snapshot of a dataset.

  This function attempts to determine whether a valid snapshot exists at the
  `path`, and reads from the snapshot if so. If not, it will run the
  preprocessing pipeline as usual, and write out a snapshot of the data
  processed for future use.
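
  A minimal usage sketch (the snapshot directory is hypothetical, and
  `my_dataset` stands in for any existing `tf.data.Dataset`):

  ```python
  dataset = my_dataset.apply(
      legacy_snapshot("/path/to/snapshot/dir", compression="GZIP"))
  # The first run writes the snapshot; later runs read the persisted data.
  for element in dataset:
    ...
  ```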

  Args:
    path: A directory where we want to save our snapshots and/or read from a
      previously saved snapshot.
    compression: The type of compression to apply to the Dataset. Currently
      supports "GZIP" or None. Defaults to None (no compression).
    reader_path_prefix: A prefix to add to the path when reading from snapshots.
      Defaults to None.
    writer_path_prefix: A prefix to add to the path when writing to snapshots.
      Defaults to None.
    shard_size_bytes: The size of each shard to be written by the snapshot
      dataset op. Defaults to 10 GiB.
    pending_snapshot_expiry_seconds: How long to wait (in seconds) before the
      snapshot op considers a previously unfinished snapshot to be stale.
    num_reader_threads: Number of threads to parallelize reading from snapshot.
      Especially useful if compression is turned on since the decompression
      operation tends to be intensive. Defaults to 1. If > 1, this might
      introduce non-determinism, i.e. the order in which elements are read
      from the snapshot may differ from the order in which they were written.
    reader_buffer_size: Maximum number of elements we can prefetch reading from
      the snapshot. Defaults to 1. Increasing this might improve performance but
      will increase memory consumption.
    num_writer_threads: Number of threads to parallelize writing to the
      snapshot. We'll open up `num_writer_threads` files and write to them in
      parallel. Especially useful if compression is turned on since the
      compression operation tends to be intensive. Defaults to 1. If > 1, this
      might introduce non-determinism, i.e. the order in which elements are
      read from the upstream iterator may differ from the order in which they
      are written.
    writer_buffer_size: Maximum number of pipeline elements to fill up the
      buffer before writing them out using `num_writer_threads`.
    shuffle_on_read: If this is True, then the order in which examples are
      produced when reading from a snapshot will be random. Defaults to False.
    shuffle_seed: Optional. If shuffle_seed is set, the random number generator
      used for shuffling (when shuffle_on_read is turned on) is seeded by the
      given seed. Otherwise, it is seeded by a random seed that differs for
      every run.
    mode: The mode at which snapshot should operate. Valid options are "auto",
      "read", "write", and "passthrough". The default mode is "auto", where the
      snapshot op will automatically determine what mode to operate in.
    snapshot_name: If set, use the supplied string as a named snapshot name
      instead of introspecting the data pipeline and automatically generating a
      unique identifier for the snapshot.

  Returns:
    A `Dataset` transformation function, which can be passed to
    `tf.data.Dataset.apply`.
  """

  def _apply_fn(dataset):
    return _LegacySnapshotDataset(
        input_dataset=dataset,
        path=path,
        compression=compression,
        reader_path_prefix=reader_path_prefix,
        writer_path_prefix=writer_path_prefix,
        shard_size_bytes=shard_size_bytes,
        pending_snapshot_expiry_seconds=pending_snapshot_expiry_seconds,
        num_reader_threads=num_reader_threads,
        reader_buffer_size=reader_buffer_size,
        num_writer_threads=num_writer_threads,
        writer_buffer_size=writer_buffer_size,
        shuffle_on_read=shuffle_on_read,
        shuffle_seed=shuffle_seed,
        mode=mode,
        snapshot_name=snapshot_name)

  return _apply_fn


@deprecation.deprecated(None, "Use `tf.data.Dataset.snapshot(...)`.")
@tf_export("data.experimental.snapshot")
def snapshot(path, compression="AUTO", reader_func=None, shard_func=None):
  """API to persist the output of the input dataset.

  The snapshot API allows users to transparently persist the output of their
  preprocessing pipeline to disk, and materialize the pre-processed data on a
  different training run.

  This API enables repeated preprocessing steps to be consolidated, and allows
  re-use of already processed data, trading off disk storage and network
  bandwidth for freeing up more valuable CPU resources and accelerator compute
  time.

  https://github.com/tensorflow/community/blob/master/rfcs/20200107-tf-data-snapshot.md
  has detailed design documentation of this feature.

  Users can specify various options to control the behavior of snapshot,
  including how snapshots are read from and written to by passing in
  user-defined functions to the `reader_func` and `shard_func` parameters.

  `shard_func` is a user specified function that maps input elements to snapshot
  shards.

  Users may want to specify this function to control how snapshot files should
  be written to disk. Below is an example of how a potential shard_func could
  be written.

  ```python
  dataset = ...
  dataset = dataset.enumerate()
  dataset = dataset.apply(tf.data.experimental.snapshot("/path/to/snapshot/dir",
      shard_func=lambda x, y: x % NUM_SHARDS, ...))
  dataset = dataset.map(lambda x, y: y)
  ```

  `reader_func` is a user specified function that accepts a single argument:
  (1) a Dataset of Datasets, each representing a "split" of elements of the
  original dataset. The cardinality of the input dataset matches the
  number of shards specified in `shard_func` (see above). The function
  should return a Dataset of elements of the original dataset.

  Users may want to specify this function to control how snapshot files should
  be read from disk, including the amount of shuffling and parallelism.

  Here is an example of a standard reader function a user can define. This
  function enables both dataset shuffling and parallel reading of datasets:

  ```python
  def user_reader_func(datasets):
    # shuffle the datasets splits
    datasets = datasets.shuffle(NUM_CORES)
    # read datasets in parallel and interleave their elements
    return datasets.interleave(lambda x: x, num_parallel_calls=AUTOTUNE)

  dataset = dataset.apply(tf.data.experimental.snapshot("/path/to/snapshot/dir",
      reader_func=user_reader_func))
  ```

  By default, snapshot parallelizes reads by the number of cores available on
  the system, but will not attempt to shuffle the data.
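
  A minimal usage sketch with the default settings (the snapshot directory is
  hypothetical, and `my_dataset` stands in for any existing `tf.data.Dataset`):

  ```python
  dataset = my_dataset.apply(
      tf.data.experimental.snapshot("/path/to/snapshot/dir"))
  # Later runs can materialize the already-processed data from disk.
  ```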

  Args:
    path: Required. A directory to use for storing / loading the snapshot to /
      from.
    compression: Optional. The type of compression to apply to the snapshot
      written to disk. Supported options are `GZIP`, `SNAPPY`, `AUTO` or None.
      Defaults to AUTO, which attempts to pick an appropriate compression
      algorithm for the dataset.
    reader_func: Optional. A function to control how to read data from snapshot
      shards.
    shard_func: Optional. A function to control how to shard data when writing a
      snapshot.

  Returns:
    A `Dataset` transformation function, which can be passed to
    `tf.data.Dataset.apply`.
  """

  def _apply_fn(dataset):
    """Actual dataset transformation."""
    return dataset.snapshot(
        path=path,
        compression=compression,
        reader_func=reader_func,
        shard_func=shard_func)

  return _apply_fn