"""Affine layers for building neural networks."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import tensorflow.compat.v2 as tf

from tensorflow_probability.python.distributions import distribution as distribution_lib
from tensorflow_probability.python.distributions import normal as normal_lib
from tensorflow_probability.python.experimental.nn import layers as layers_lib
from tensorflow_probability.python.experimental.nn import util as nn_util_lib
from tensorflow_probability.python.experimental.nn import variational_base as vi_lib
from tensorflow_probability.python.internal import prefer_static


__all__ = [
    'Affine',
    'AffineVariationalFlipout',
    'AffineVariationalReparameterization',
    'AffineVariationalReparameterizationLocal',
]


tfd = distribution_lib

kl_divergence_monte_carlo = vi_lib.kl_divergence_monte_carlo
unpack_kernel_and_bias = vi_lib.unpack_kernel_and_bias


class Affine(layers_lib.KernelBiasLayer):
  """Basic affine layer."""

  def __init__(
      self,
      input_size,
      output_size,
      # Weights
      init_kernel_fn=None,  # tfp.experimental.nn.initializers.glorot_uniform()
      init_bias_fn=None,    # tf.initializers.zeros()
      make_kernel_bias_fn=nn_util_lib.make_kernel_bias,
      dtype=tf.float32,
      batch_shape=(),
      # Misc
      activation_fn=None,
      name=None):
ak  Constructs layer.

    Args:
      input_size: ...
      output_size: ...
      init_kernel_fn: ...
        Default value: `None` (i.e.,
        `tfp.experimental.nn.initializers.glorot_uniform()`).
      init_bias_fn: ...
        Default value: `None` (i.e., `tf.initializers.zeros()`).
      make_kernel_bias_fn: ...
        Default value: `tfp.experimental.nn.util.make_kernel_bias`.
      dtype: ...
        Default value: `tf.float32`.
      batch_shape: ...
        Default value: `()`.
      activation_fn: ...
        Default value: `None`.
      name: ...
        Default value: `None` (i.e., `'Affine'`).
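
    #### Examples

    A minimal usage sketch (the sizes and `relu` activation below are
    illustrative choices, not defaults):

    ```python
    layer = Affine(input_size=5, output_size=3, activation_fn=tf.nn.relu)
    x = tf.random.normal([8, 5])   # Batch of 8 length-5 feature vectors.
    y = layer(x)                   # Shape: [8, 3].
    ```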
    N)dtype)shaper   )Zaxisc             S   s   t jj|| ddS )NT)Z	adjoint_a)tfZlinalgZmatvec)xkr   r   j/home/dcms/DCMS/lib/python3.7/site-packages/tensorflow_probability/python/experimental/nn/affine_layers.py<lambda>d       z!Affine.__init__.<locals>.<lambda>)kernelbiasapply_kernel_fnactivation_fnr   name)nparrayint32r
   Zreshapesizer   Zget_static_valuematmulconcatZ_make_kernel_bias_fnsuperr   __init__)self
input_sizeoutput_sizeinit_kernel_fninit_bias_fnZmake_kernel_bias_fnr   Zbatch_shaper   r   batch_ndimsZkernel_shapeZ
bias_shaper   r   r   )	__class__r   r   r%   1   s4    "



zAffine.__init__)


class AffineVariationalReparameterization(
    vi_lib.VariationalReparameterizationKernelBiasLayer):
ejdedddf fdd	Z  ZS )r   a  Densely-connected layer class with reparameterization estimator.

  This layer implements the Bayesian variational inference analogue to
  a dense layer by assuming the `kernel` and/or the `bias` are drawn
  from distributions. By default, the layer implements a stochastic
  forward pass via sampling from the kernel and bias posteriors,

  ```none
  kernel, bias ~ posterior
  outputs = matmul(inputs, kernel) + bias
  ```

  It uses the reparameterization estimator [(Kingma and Welling, 2014)][1],
  which performs a Monte Carlo approximation of the distribution integrating
  over the `kernel` and `bias`.

  The arguments permit separate specification of the surrogate posterior
  (`q(W|x)`), prior (`p(W)`), and divergence for both the `kernel` and `bias`
  distributions.

  Upon being built, this layer adds losses (accessible via the `losses`
  property) representing the divergences of `kernel` and/or `bias` surrogate
  posteriors and their respective priors. When doing minibatch stochastic
  optimization, make sure to scale this loss such that it is applied just once
  per epoch (e.g. if `kl` is the sum of `losses` for each element of the batch,
  you should pass `kl / num_examples_per_epoch` to your optimizer).

  You can access the `kernel` and/or `bias` posterior and prior distributions
  after the layer is built via the `kernel_posterior`, `kernel_prior`,
  `bias_posterior` and `bias_prior` properties.

  #### Examples

  We illustrate a Bayesian neural network with [variational inference](
  https://en.wikipedia.org/wiki/Variational_Bayesian_methods),
  assuming a dataset of images and length-10 one-hot `targets`.

  ```python
  import functools
  import numpy as np
  import tensorflow.compat.v2 as tf
  import tensorflow_probability as tfp
  import tensorflow_datasets as tfds
  tfb = tfp.bijectors
  tfd = tfp.distributions
  tfn = tfp.experimental.nn

  # 1  Prepare Dataset

  [train_dataset, eval_dataset], datasets_info = tfds.load(
      name='mnist',
      split=['train', 'test'],
      with_info=True,
      as_supervised=True,
      shuffle_files=True)
  def _preprocess(image, label):
    # image = image < tf.random.uniform(tf.shape(image))   # Randomly binarize.
    image = tf.cast(image, tf.float32) / 255.  # Scale to unit interval.
    lo = 0.001
    image = (1. - 2. * lo) * image + lo  # Rescale to *open* unit interval.
    return image, label
  batch_size = 32
  train_size = datasets_info.splits['train'].num_examples
  train_dataset = tfn.util.tune_dataset(
      train_dataset,
      batch_shape=(batch_size,),
      shuffle_size=int(train_size / 7),
      preprocess_fn=_preprocess)
  train_iter = iter(train_dataset)
  eval_iter = iter(eval_dataset)
  x, y = next(train_iter)
  evidence_shape = x.shape[1:]
  targets_shape = y.shape[1:]

  # 2  Specify Model

  BayesConv2D = functools.partial(
      tfn.ConvolutionVariationalReparameterization,
      rank=2,
      padding='same',
      filter_shape=5,
      # Use `he_uniform` because we'll use the `relu` family.
      init_kernel_fn=tf.initializers.he_uniform())

  BayesAffine = functools.partial(
      tfn.AffineVariationalReparameterization,
      init_kernel_fn=tf.initializers.he_normal())

  scale = tfp.util.TransformedVariable(1., tfb.Softplus())
  bnn = tfn.Sequential([
      BayesConv2D(evidence_shape[-1], 32, filter_shape=7, strides=2,
                  activation_fn=tf.nn.leaky_relu),           # [b, 14, 14, 32]
      tfn.util.flatten_rightmost(ndims=3),                   # [b, 14 * 14 * 32]
      BayesAffine(14 * 14 * 32, np.prod(targets_shape) - 1),  # [b, 9]
      tfn.Lambda(
          eval_fn=lambda loc: tfb.SoftmaxCentered()(
              tfd.Independent(tfd.Normal(loc, scale),
                              reinterpreted_batch_ndims=1)),
          also_track=scale),                                 # [b, 10]
  ], name='bayesian_neural_network')

  print(bnn.summary())

  # 3  Train.

  def loss_fn():
    x, y = next(train_iter)
    nll = -tf.reduce_mean(bnn(x).log_prob(y), axis=-1)
    kl = bnn.extra_loss / tf.cast(train_size, tf.float32)
    loss = nll + kl
    return loss, (nll, kl)
  opt = tf.optimizers.Adam()
  fit_op = tfn.util.make_fit_op(loss_fn, opt, bnn.trainable_variables)
  for _ in range(200):
    loss, (nll, kl), g = fit_op()
  ```

  This example uses reparameterization gradients to minimize the
  Kullback-Leibler divergence up to a constant, also known as the negative
  Evidence Lower Bound. It consists of the sum of two terms: the expected
  negative log-likelihood, which we approximate via Monte Carlo; and the KL
  divergence, which is added via regularizer terms which are arguments to the
  layer.
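
  Written out, the minimized loss is the negative evidence lower bound
  (up to an additive constant):

  ```none
  -ELBO(q) = E_{W ~ q}[ -log p(y | x, W) ] + KL( q(W) || p(W) )
  ```

  where the expectation is estimated with Monte Carlo draws of `W` and the
  `KL` term is the penalty this layer adds.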

  #### References

  [1]: Diederik Kingma and Max Welling. Auto-Encoding Variational Bayes. In
       _International Conference on Learning Representations_, 2014.
       https://arxiv.org/abs/1312.6114
  """

  def __init__(
      self,
      input_size,
      output_size,
      # Weights
      init_kernel_fn=None,  # tfp.experimental.nn.initializers.glorot_uniform()
      init_bias_fn=None,    # tf.initializers.zeros()
      make_posterior_fn=nn_util_lib.make_kernel_bias_posterior_mvn_diag,
      make_prior_fn=nn_util_lib.make_kernel_bias_prior_spike_and_slab,
      posterior_value_fn=tfd.Distribution.sample,
      unpack_weights_fn=unpack_kernel_and_bias,
      dtype=tf.float32,
      # Penalty.
      penalty_weight=None,
      posterior_penalty_fn=kl_divergence_monte_carlo,
      # Misc
      activation_fn=None,
      seed=None,
      name=None):
    """Constructs layer.

    Args:
      input_size: ...
      output_size: ...
      init_kernel_fn: ...
        Default value: `None` (i.e.,
        `tfp.experimental.nn.initializers.glorot_uniform()`).
      init_bias_fn: ...
        Default value: `None` (i.e., `tf.initializers.zeros()`).
      make_posterior_fn: ...
        Default value:
          `tfp.experimental.nn.util.make_kernel_bias_posterior_mvn_diag`.
      make_prior_fn: ...
        Default value:
          `tfp.experimental.nn.util.make_kernel_bias_prior_spike_and_slab`.
      posterior_value_fn: ...
        Default value: `tfd.Distribution.sample`
      unpack_weights_fn:
        Default value: `unpack_kernel_and_bias`
      dtype: ...
        Default value: `tf.float32`.
      penalty_weight: ...
        Default value: `None` (i.e., weight is `1`).
      posterior_penalty_fn: ...
        Default value: `kl_divergence_monte_carlo`.
      activation_fn: ...
        Default value: `None`.
      seed: ...
        Default value: `None` (i.e., no seed).
      name: ...
        Default value: `None` (i.e.,
        `'AffineVariationalReparameterization'`).
    """
    self._make_posterior_fn = make_posterior_fn  # For variable tracking.
    self._make_prior_fn = make_prior_fn  # For variable tracking.
    batch_ndims = 0
    super(AffineVariationalReparameterization, self).__init__(
        posterior=make_posterior_fn(
            [input_size, output_size], [output_size],
            init_kernel_fn, init_bias_fn,
            batch_ndims, batch_ndims,
            dtype),
        prior=make_prior_fn(
            [input_size, output_size], [output_size],
            init_kernel_fn, init_bias_fn,
            batch_ndims, batch_ndims,
            dtype),
        apply_kernel_fn=tf.matmul,
        posterior_value_fn=posterior_value_fn,
        unpack_weights_fn=unpack_weights_fn,
        dtype=dtype,
        penalty_weight=penalty_weight,
        posterior_penalty_fn=posterior_penalty_fn,
        activation_fn=activation_fn,
        seed=seed,
        name=name)


class AffineVariationalFlipout(vi_lib.VariationalFlipoutKernelBiasLayer):
ejdedddf fdd	Z  ZS )r   a	  Densely-connected layer class with Flipout estimator.

  This layer implements the Bayesian variational inference analogue to
  a dense layer by assuming the `kernel` and/or the `bias` are drawn
  from distributions. By default, the layer implements a stochastic
  forward pass via sampling from the kernel and bias posteriors,

  ```none
  kernel, bias ~ posterior
  outputs = tf.linalg.matmul(inputs, kernel) + bias
  ```

  It uses the Flipout estimator [(Wen et al., 2018)][1], which performs a Monte
  Carlo approximation of the distribution integrating over the `kernel` and
  `bias`. Flipout uses roughly twice as many floating point operations as the
  reparameterization estimator but has the advantage of significantly lower
  variance.
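
  Schematically, Flipout decorrelates the weight perturbations seen by the
  examples in a batch using random sign flips (a rough sketch of the idea,
  not the exact implementation in this layer):

  ```none
  kernel = kernel_loc + kernel_perturbation   # perturbation ~ posterior - loc
  outputs = matmul(x, kernel_loc)
            + matmul(x * sign_in, kernel_perturbation) * sign_out + bias
  ```

  where `sign_in` and `sign_out` are independent random sign vectors
  (resampled per example), so each example effectively sees a
  pseudo-independent weight perturbation.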

  The arguments permit separate specification of the surrogate posterior
  (`q(W|x)`), prior (`p(W)`), and divergence for both the `kernel` and `bias`
  distributions.

  Upon being built, this layer adds losses (accessible via the `losses`
  property) representing the divergences of `kernel` and/or `bias` surrogate
  posteriors and their respective priors. When doing minibatch stochastic
  optimization, make sure to scale this loss such that it is applied just once
  per epoch (e.g. if `kl` is the sum of `losses` for each element of the batch,
  you should pass `kl / num_examples_per_epoch` to your optimizer).

  #### Examples

  We illustrate a Bayesian neural network with [variational inference](
  https://en.wikipedia.org/wiki/Variational_Bayesian_methods),
  assuming a dataset of images and length-10 one-hot `targets`.

  ```python
  # Using the following substitution, see:
  tfn = tfp.experimental.nn
  help(tfn.AffineVariationalReparameterization)
  BayesAffine = tfn.AffineVariationalFlipout
  ```

  This example uses reparameterization gradients to minimize the
  Kullback-Leibler divergence up to a constant, also known as the negative
  Evidence Lower Bound. It consists of the sum of two terms: the expected
  negative log-likelihood, which we approximate via Monte Carlo; and the KL
  divergence, which is added via regularizer terms which are arguments to the
  layer.

  #### References

  [1]: Yeming Wen, Paul Vicol, Jimmy Ba, Dustin Tran, and Roger Grosse. Flipout:
       Efficient Pseudo-Independent Weight Perturbations on Mini-Batches. In
       _International Conference on Learning Representations_, 2018.
       https://arxiv.org/abs/1803.04386
  """

  def __init__(
      self,
      input_size,
      output_size,
      # Weights
      init_kernel_fn=None,  # tfp.experimental.nn.initializers.glorot_uniform()
      init_bias_fn=None,    # tf.initializers.zeros()
      make_posterior_fn=nn_util_lib.make_kernel_bias_posterior_mvn_diag,
      make_prior_fn=nn_util_lib.make_kernel_bias_prior_spike_and_slab,
      posterior_value_fn=tfd.Distribution.sample,
      unpack_weights_fn=unpack_kernel_and_bias,
      dtype=tf.float32,
      # Penalty.
      penalty_weight=None,
      posterior_penalty_fn=kl_divergence_monte_carlo,
      # Misc
      activation_fn=None,
      seed=None,
      name=None):
    """Constructs layer.

    Args:
      input_size: ...
      output_size: ...
      init_kernel_fn: ...
        Default value: `None` (i.e.,
        `tfp.experimental.nn.initializers.glorot_uniform()`).
      init_bias_fn: ...
        Default value: `None` (i.e., `tf.initializers.zeros()`).
      make_posterior_fn: ...
        Default value:
          `tfp.experimental.nn.util.make_kernel_bias_posterior_mvn_diag`.
      make_prior_fn: ...
        Default value:
          `tfp.experimental.nn.util.make_kernel_bias_prior_spike_and_slab`.
      posterior_value_fn: ...
        Default value: `tfd.Distribution.sample`
      unpack_weights_fn:
        Default value: `unpack_kernel_and_bias`
      dtype: ...
        Default value: `tf.float32`.
      penalty_weight: ...
        Default value: `None` (i.e., weight is `1`).
      posterior_penalty_fn: ...
        Default value: `kl_divergence_monte_carlo`.
      activation_fn: ...
        Default value: `None`.
      seed: ...
        Default value: `None` (i.e., no seed).
      name: ...
        Default value: `None` (i.e.,
        `'AffineVariationalFlipout'`).
    """
    self._make_posterior_fn = make_posterior_fn  # For variable tracking.
    self._make_prior_fn = make_prior_fn  # For variable tracking.
    batch_ndims = 0
    super(AffineVariationalFlipout, self).__init__(
        posterior=make_posterior_fn(
            [input_size, output_size], [output_size],
            init_kernel_fn, init_bias_fn,
            batch_ndims, batch_ndims,
            dtype),
        prior=make_prior_fn(
            [input_size, output_size], [output_size],
            init_kernel_fn, init_bias_fn,
            batch_ndims, batch_ndims,
            dtype),
        apply_kernel_fn=tf.matmul,
        posterior_value_fn=posterior_value_fn,
        unpack_weights_fn=unpack_weights_fn,
        dtype=dtype,
        penalty_weight=penalty_weight,
        posterior_penalty_fn=posterior_penalty_fn,
        activation_fn=activation_fn,
        seed=seed,
        name=name)


class AffineVariationalReparameterizationLocal(vi_lib.VariationalLayer):
  """Densely-connected layer class with local reparameterization estimator.

  This layer implements the Bayesian variational inference analogue to
  a dense layer by assuming the `kernel` and/or the `bias` are drawn
  from distributions. By default, the layer implements a stochastic
  forward pass via sampling from the kernel and bias posteriors,

  ```none
  kernel, bias ~ posterior
  outputs = matmul(inputs, kernel) + bias
  ```

  It uses the local reparameterization estimator [(Kingma et al., 2015)][1],
  which performs a Monte Carlo approximation of the distribution on the hidden
  units induced by the `kernel` and `bias`. The default `kernel_posterior_fn`
  is a normal distribution which factorizes across all elements of the weight
  matrix and bias vector. Unlike [1]'s multiplicative parameterization, this
  distribution has trainable location and scale parameters which is known as
  an additive noise parameterization [(Molchanov et al., 2017)][2].
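
  Schematically, instead of sampling a kernel and multiplying, the layer
  samples the hidden units directly (a simplified sketch of what `_eval`
  computes below; the bias handling there is slightly more general):

  ```none
  loc   = matmul(x, kernel_loc) + bias_loc
  scale = sqrt(matmul(x**2, kernel_scale**2) + bias_scale**2)
  outputs ~ Normal(loc, scale)
  ```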

  The arguments permit separate specification of the surrogate posterior
  (`q(W|x)`), prior (`p(W)`), and divergence for both the `kernel` and `bias`
  distributions.

  Upon being built, this layer adds losses (accessible via the `losses`
  property) representing the divergences of `kernel` and/or `bias` surrogate
  posteriors and their respective priors. When doing minibatch stochastic
  optimization, make sure to scale this loss such that it is applied just once
  per epoch (e.g. if `kl` is the sum of `losses` for each element of the batch,
  you should pass `kl / num_examples_per_epoch` to your optimizer).

  You can access the `kernel` and/or `bias` posterior and prior distributions
  after the layer is built via the `kernel_posterior`, `kernel_prior`,
  `bias_posterior` and `bias_prior` properties.

  #### Examples

  We illustrate a Bayesian neural network with [variational inference](
  https://en.wikipedia.org/wiki/Variational_Bayesian_methods),
  assuming a dataset of images and length-10 one-hot `targets`.

  ```python
  # Using the following substitution, see:
  tfn = tfp.experimental.nn
  help(tfn.AffineVariationalReparameterization)
  BayesAffine = tfn.AffineVariationalReparameterizationLocal
  ```

  This example uses reparameterization gradients to minimize the
  Kullback-Leibler divergence up to a constant, also known as the negative
  Evidence Lower Bound. It consists of the sum of two terms: the expected
  negative log-likelihood, which we approximate via Monte Carlo; and the KL
  divergence, which is added via regularizer terms which are arguments to the
  layer.

  #### References

  [1]: Diederik Kingma, Tim Salimans, and Max Welling. Variational Dropout and
       the Local Reparameterization Trick. In _Neural Information Processing
       Systems_, 2015. https://arxiv.org/abs/1506.02557
  [2]: Dmitry Molchanov, Arsenii Ashukha, Dmitry Vetrov. Variational Dropout
       Sparsifies Deep Neural Networks. In _International Conference on Machine
       Learning_, 2017. https://arxiv.org/abs/1701.05369
  """

  def __init__(
      self,
      input_size,
      output_size,
      # Weights
      init_kernel_fn=None,  # tfp.experimental.nn.initializers.glorot_uniform()
      init_bias_fn=None,    # tf.initializers.zeros()
      make_posterior_fn=nn_util_lib.make_kernel_bias_posterior_mvn_diag,
      make_prior_fn=nn_util_lib.make_kernel_bias_prior_spike_and_slab,
      posterior_value_fn=tfd.Distribution.sample,
      unpack_weights_fn=unpack_kernel_and_bias,
      dtype=tf.float32,
      # Penalty.
      penalty_weight=None,
      posterior_penalty_fn=kl_divergence_monte_carlo,
      # Misc
      activation_fn=None,
      seed=None,
      name=None):
    """Constructs layer.

    Args:
      input_size: ...
      output_size: ...
      init_kernel_fn: ...
        Default value: `None` (i.e.,
        `tfp.experimental.nn.initializers.glorot_uniform()`).
      init_bias_fn: ...
        Default value: `None` (i.e., `tf.initializers.zeros()`).
      make_posterior_fn: ...
        Default value:
          `tfp.experimental.nn.util.make_kernel_bias_posterior_mvn_diag`.
      make_prior_fn: ...
        Default value:
          `tfp.experimental.nn.util.make_kernel_bias_prior_spike_and_slab`.
      posterior_value_fn: ...
        Default value: `tfd.Distribution.sample`
      unpack_weights_fn:
        Default value: `unpack_kernel_and_bias`
      dtype: ...
        Default value: `tf.float32`.
      penalty_weight: ...
        Default value: `None` (i.e., weight is `1`).
      posterior_penalty_fn: ...
        Default value: `kl_divergence_monte_carlo`.
      activation_fn: ...
        Default value: `None`.
      seed: ...
        Default value: `None` (i.e., no seed).
      name: ...
        Default value: `None` (i.e.,
          `'AffineVariationalReparameterizationLocal'`).
    """
    self._make_posterior_fn = make_posterior_fn  # For variable tracking.
    self._make_prior_fn = make_prior_fn  # For variable tracking.
    batch_ndims = 0
    super(AffineVariationalReparameterizationLocal, self).__init__(
        posterior=make_posterior_fn(
            [input_size, output_size], [output_size],
            init_kernel_fn, init_bias_fn,
            batch_ndims, batch_ndims,
            dtype),
        prior=make_prior_fn(
            [input_size, output_size], [output_size],
            init_kernel_fn, init_bias_fn,
            batch_ndims, batch_ndims,
            dtype),
        activation_fn=activation_fn,
        penalty_weight=penalty_weight,
        posterior_penalty_fn=posterior_penalty_fn,
        posterior_value_fn=posterior_value_fn,
        dtype=dtype,
        seed=seed,
        name=name)
    self._unpack_weights_fn = unpack_weights_fn  # For variable tracking.

  @property
  def unpack_weights_fn(self):
    return self._unpack_weights_fn

  def _eval(self, x, weights):
    # Recover the (kernel, bias) posterior distributions corresponding to the
    # sampled `weights`.
    kernel_dist, bias_dist = self.unpack_weights_fn(
        self.posterior.sample_distributions(value=weights)[0])
    kernel_loc, kernel_scale = vi_lib.get_spherical_normal_loc_scale(
        kernel_dist)

    # Locally reparameterize: under a factorized normal posterior, the
    # distribution of `matmul(x, kernel)` is itself normal with these
    # parameters.
    loc = tf.matmul(x, kernel_loc)
    scale = tf.sqrt(tf.matmul(tf.square(x), tf.square(kernel_scale)))

    _, sampled_bias = self.unpack_weights_fn(weights)
    if sampled_bias is not None:
      try:
        bias_loc, bias_scale = vi_lib.get_spherical_normal_loc_scale(bias_dist)
        is_bias_spherical_normal = True
      except TypeError:
        is_bias_spherical_normal = False
      if is_bias_spherical_normal:
        # Fold the independent-normal bias into the hidden-unit distribution.
        loc = loc + bias_loc
        scale = tf.sqrt(tf.square(scale) + tf.square(bias_scale))
      else:
        # Otherwise fall back to adding the sampled bias directly.
        loc = loc + sampled_bias

    y = normal_lib.Normal(loc=loc, scale=scale).sample(seed=self._seed())
    if self.activation_fn is not None:
      y = self.activation_fn(y)
    return y