B
    )`k2                 @   s2  d Z ddlmZ ddlmZ ddlmZ ddlZddlZddlZddlZddl	Z	ddl
mZ ddlmZ ddlmZ dd	lmZ d
ZyddlmZ ddlmZ W n ek
r   dZY nX ejZedd
d dZdZdZdZdZdZdZ dZ!dd Z"dd Z#dd Z$dd Z%d d! Z&d"d# Z'G d$d% d%e(Z)dS )&zCloud TPU Client.    )absolute_import)division)print_functionN)flags)futures)request)	HTTPErrorT)	discovery)clientFruntime_oom_exitz,Exit the script when the TPU runtime is OOM.ZKUBE_GOOGLE_CLOUD_TPU_ENDPOINTS,ZTPU_NAMEZTPU_API_DISCOVERY_URLzhttp://metadata.google.internalZ8470Z   zhttp://{}:8475/requestversionc               C   s
   t j  S )zA wrapper function around datetime.datetime.utcnow.

  This function is created for unit testing purpose. It's not easy to do
  StubOutWithMock with datetime.datetime package.

  Returns:
    datetime.datetime
  )datetimeutcnow r   r   R/home/dcms/DCMS/lib/python3.7/site-packages/tensorflow/python/tpu/client/client.py_utcnow7   s    	r   c               C   s   t jtS )N)osenvironget#_DISCOVERY_SERVICE_URL_ENV_VARIABLEr   r   r   r   _environment_discovery_urlC   s    r   c             C   s0   t jdt| f ddid}t |}t| S )Nz%s/computeMetadata/v1/%szMetadata-FlavorZGoogle)headers)r   Request_GCE_METADATA_ENDPOINTurlopen_as_textread)pathreqrespr   r   r   _request_compute_metadataG   s
    

r!   c             c   sj   xd|  dD ]V}d}||r,| |d }| d}|d }t}t|dkrV|d }||dV  qW dS )z'Yields a dict with ip address and port.r   zgrpc://   :r   )	ipAddressportN)split
startswith_DEFAULT_ENDPOINT_PORTlen)Z	endpointsZendpointZgrpc_prefixparts
ip_addressr%   r   r   r   %_environment_var_to_network_endpointsO   s    

r,   c             C   s2   | r| S x$t tgD ]}|tjkrtj| S qW d S )N)_GKE_ENV_VARIABLE_DEFAULT_ENV_VARIABLEr   r   )tpuer   r   r   _get_tpu_name`   s    
r1   c             C   s   t | tr| dS | S )Nzutf-8)
isinstancebytesdecode)sr   r   r   r   j   s    

r   c               @   s   e Zd ZdZd1ddZdd Zdd	 Zd
d Zdd Zdd Z	dd Z
dd Zdd Zdd Zdd Zdd Zdd Zdd Zd d! Zd"d# Zd$d% Zd&d' Zd(d) Zd2d,d-Zd3d/d0ZdS )4ClientzClient for working with the Cloud TPU API.

  This client is intended to be used for resolving tpu name to ip addresses.

  It's recommended to use this library as a contextlib to utilize all
  functionality.
  Ndefaultc             C   s   t |tr2|stdt|dkr*td|d }t|}|d krJtdt|| _| jd | _	|| _
d | _d | _d | _d | _| j	r|dkr|| _|rt|| _n
td| _|rt|| _ntd	}|d
d | _t p|| _d S )Nz#At least one TPU must be specified.r"   z>Using multiple TPUs in a single session is not yet implementedr   z(Please provide a TPU Name to connect to.zgrpc://r7   zproject/project-idzinstance/zone/)r2   list
ValueErrorr)   NotImplementedErrorr1   r   _tpur'   _use_api_service_credentials_project_zone_discovery_urlr!   r&   r   )selfr/   zoneprojectcredentialsserviceZdiscovery_urlZ	zone_pathr   r   r   __init__y   s8    


zClient.__init__c             C   s   d| S )z&Return the structured Symptom message.z	Symptom: r   )rD   msgr   r   r   _symptom_msg   s    zClient._symptom_msgc             C   s   |   }|sdS xtt|D ]h}|d dkr,q|d dd }tj|d}t | }|tjtdk rt	| 
d	|j d
S qW dS )z)Check if a runtime OOM event is reported.FZsymptomTypeZOUT_OF_MEMORYZ
createTime.r   z%Y-%m-%dT%H:%M:%S)secondsza recent runtime OOM has occured ~{} seconds ago. The model script will terminate automatically. To prevent future OOM events, please consider reducing the model size. To disable this behavior, set flag --runtime_oom_exit=false when starting the script.T)symptomsreversedr&   r   strptimer   	timedelta_OOM_EVENT_COOL_TIME_SECloggingwarningrK   formatrM   )rD   rN   ZsymptomZoom_datetime_strZoom_datetimeZ	time_diffr   r   r   
_oom_event   s     
zClient._oom_eventc             C   sj   | j r| j S tstd| j}|dks.|dkr8tj }| jrTtj	dd|| jddS tj	dd|ddS dS )	a  Creates a new Cloud TPU API object.

    This works around an issue where the underlying HTTP connection sometimes
    times out when the script has been running for too long. Other methods in
    this object call this method to get a new API object whenever they need
    to communicate with the Cloud API.

    Raises:
      RuntimeError: If the dependent Python packages are missing.

    Returns:
      A Google Cloud TPU API object.
    z_Missing runtime dependency on the Google API client. Run `pip install cloud-tpu-client` to fix.Nr7   r/   Zv1F)rG   ZdiscoveryServiceUrlcache_discovery)rG   rW   )
r?   _GOOGLE_API_CLIENT_INSTALLEDRuntimeErrorr@   r
   ZGoogleCredentialsZget_application_defaultrC   r	   build)rD   rG   r   r   r   _tpu_service   s     
zClient._tpu_servicec             C   s   d| j | j| jf S )z)Returns the full Cloud name for this TPU.z!projects/%s/locations/%s/nodes/%s)rA   rB   r=   )rD   r   r   r   
_full_name   s    zClient._full_namec          
   C   sf   |   }y$|   j|  d}| S  tk
r` } ztd| j	|f W dd}~X Y nX dS )z:Returns the TPU metadata object from the TPU Get API call.)namezCould not lookup TPU metadata from name '%s'. Please doublecheck the tpu argument in the TPUClusterResolver constructor. Exception: %sN)
r[   Zprojects	locationsZnodesr   r\   execute	Exceptionr;   r=   )rD   rH   rr0   r   r   r   _fetch_cloud_tpu_metadata   s    z Client._fetch_cloud_tpu_metadatac             C   s   | j r|  }||S d S )N)r>   rb   r   )rD   keymetadatar   r   r   _get_tpu_property   s    
zClient._get_tpu_propertyc             C   s
   d| _ d S )NT)_open)rD   r   r   r   	__enter__   s    zClient.__enter__c             C   s
   ~~~d S )Nr   )rD   typevalue	tracebackr   r   r   __exit__   s    zClient.__exit__c             C   s.   |   }|r|dkrdS tjr*|  r*dS dS )zReturns true if the TPU is in a state where training should eventually resume.

    If false the TPU is in a unrecoverable state and should be recreated.
    )Z
TERMINATEDZ	PREEMPTEDFT)stateFLAGSr   rV   )rD   rl   r   r   r   recoverable   s    zClient.recoverablec             C   s
   |  dS )z%Return Cloud TPU Symptoms of the TPU.rN   )re   )rD   r   r   r   rN     s    zClient.symptomsc             C   s
   |  dS )zReturn state of the TPU.rl   )re   )rD   r   r   r   rl     s    zClient.statec             C   s
   |  dS )zReturn health of the TPU.health)re   )rD   r   r   r   ro     s    zClient.healthc          
   C   s   | j st|  d d }y,t|}t|}t|	 }|
dS  tk
r } z|j}|dkrldS |W dd}~X Y nX | dS )z"Return runtime version of the TPU.r   r$   ZcurrentVersioni  NZtensorflowVersion)r>   _VERSION_SWITCHER_ENDPOINTrU   network_endpointsr   r   r   jsonloadsr   r   r   codere   )rD   urlr   r    Zversion_detailsr0   status_coder   r   r   runtime_version  s    


zClient.runtime_versionc             C   s
   |  dS )z#Return accelerator type of the TPU.ZacceleratorType)re   )rD   r   r   r   accelerator_type,  s    zClient.accelerator_typec             C   s   | j S )zPReturn if the Cloud TPU API is available, if not certain features will not work.)r>   )rD   r   r   r   api_available0  s    zClient.api_availablec             C   s   | j S )zFReturn the name of the tpu, or the ip address if name is not provided.)r=   )rD   r   r   r   r]   4  s    zClient.namec             C   s   t dS )zNReturn the local ip address of the Google Cloud VM the workload is running on.z instance/network-interfaces/0/ip)r!   )rD   r   r   r   get_local_ip8  s    zClient.get_local_ipc             C   sj   | j stt| jS |  }|ddkrBtd| j|df d|krR|d S |d |d dgS dS )	zReturn a list of tpu endpoints.rl   ZREADYz&TPU "%s" is not yet ready; state: "%s"ZnetworkEndpointsr$   r%   )r$   r%   N)r>   r:   r,   r=   rb   r   rY   )rD   responser   r   r   rq   <  s    zClient.network_endpoints     c             C   sx   t   | }xV|  dkrbtd|  |  |   t   | |krVtd|   t | qW td|   dS )a?  Wait for TPU to become healthy or raise error if timeout reached.

    Args:
      timeout_s (int): The timeout in seconds for waiting TPU to become healthy.
      interval (int): The interval in seconds to poll the TPU for health.

    Raises:
      RuntimeError: If the TPU doesn't become healthy by the timeout.
    ZHEALTHYzFWaiting for TPU "%s" with state "%s" and health "%s" to become healthyz0Timed out waiting for TPU "%s" to become healthyzTPU "%s" is healthy.N)timero   rS   rT   r]   rl   rY   sleep)rD   Z	timeout_sintervaltimeoutr   r   r   wait_for_healthyJ  s    
zClient.wait_for_healthyalwaysc          	      s\    fdd}|   }tjt|d,}|||}x|D ]}|r:|  q:W W dQ R X dS )a  Configure TPU software version.

    Args:
      version (string): Version of software to configure the TPU with.
      restart_type (string): Restart behaviour when switching versions,
        defaults to always restart. Options are 'always', 'ifNeeded'.

    c          
      s   | d }t d | }tj|dd}yt| W nN tk
r } z0|j}|dkrftdntd|W dd}~X Y nX dS )	zConfigure individual TPU worker.

      Args:
        worker: A dict with the field ipAddress where the configure request will
          be sent.
      r$   z/{}?restartType={}    )datai  zTensorflow version {} is not available on Cloud TPU, try a previous nightly version or refer to https://cloud.google.com/tpu/docs/release-notes for the latest official version.zFailed to configure worker {}N)rp   rU   r   r   r   r   rt   r`   )Zworkerr+   ru   r   r0   rv   )restart_typeversionr   r   configure_workerk  s    

z6Client.configure_tpu_version.<locals>.configure_worker)max_workersN)rq   r   ZThreadPoolExecutorr)   mapresult)rD   r   r   r   workersexecutorresultsr   r   )r   r   r   configure_tpu_versiona  s    

zClient.configure_tpu_version)NNNr7   NN)r|   r}   )r   )__name__
__module____qualname____doc__rI   rK   rV   r[   r\   rb   re   rg   rk   rn   rN   rl   ro   rw   rx   ry   r]   rz   rq   r   r   r   r   r   r   r6   p   s6        
&$
r6   )*r   
__future__r   r   r   r   rr   rS   r   r~   Zabslr   Z
concurrentr   Zsix.moves.urllibr   Zsix.moves.urllib.errorr   rX   Zgoogleapiclientr	   Zoauth2clientr
   ImportErrorrm   ZDEFINE_boolr-   Z_ENDPOINTS_SEPARATORr.   r   r   r(   rR   rp   r   r   r!   r,   r1   r   objectr6   r   r   r   r   <module>   sH   

