B
    @
d,                 @   s   d dl mZ d dlmZ d dlmZ d dlmZmZm	Z	m
Z
mZmZmZ ddlmZ ddlmZmZmZ G dd	 d	ZG d
d dZeeef Ze
e ZG dd dZdS )    )aliases)sha256)dumps)AnyDictIteratorListOptionalTupleUnion   )TOO_BIG_SEQUENCE)	iana_nameis_multi_byte_encodingunicode_rangec               @   s  e Zd Zd8eeeedee dddZe	edddZ
e	edd	d
ZeedddZedddZedddZd ddddZeedddZeee dddZeedddZeedddZeee dddZeedddZeedd d!Zeedd"d#Zeedd$d%Zeedd&d'Zeedd(d)Zeed  dd*d+Zeedd,d-Zeee dd.d/Zeee dd0d1Z d9eed3d4d5Z!eedd6d7Z"dS ):CharsetMatchNCoherenceMatches)payloadguessed_encodingmean_mess_ratiohas_sig_or_bom	languagesdecoded_payloadc             C   sF   || _ || _|| _|| _|| _d | _g | _d| _d | _d | _	|| _
d S )Ng        )_payload	_encoding_mean_mess_ratio
_languages_has_sig_or_bom_unicode_ranges_leavesZ_mean_coherence_ratio_output_payload_output_encoding_string)selfr   r   r   r   r   r    r$   I/tmp/pip-install-587nrtaf/charset-normalizer/charset_normalizer/models.py__init__   s    	zCharsetMatch.__init__)otherreturnc             C   s>   t |ts&tdt|jt| j| j|jko<| j|jkS )Nz&__eq__ cannot be invoked on {} and {}.)
isinstancer   	TypeErrorformatstr	__class__encodingfingerprint)r#   r'   r$   r$   r%   __eq__$   s
    
zCharsetMatch.__eq__c             C   sv   t |tstt| j|j }t| j|j }|dk rj|dkrj|dkr^| j|jkr^| j|jkS | j|jkS | j|jk S )zQ
        Implemented to make sorted available upon CharsetMatches items.
        g{Gz?g{Gz?g        )r)   r   
ValueErrorabschaos	coherencemulti_byte_usage)r#   r'   Zchaos_differenceZcoherence_differencer$   r$   r%   __lt__-   s    
zCharsetMatch.__lt__)r(   c             C   s   dt t| t | j  S )Ng      ?)lenr,   raw)r#   r$   r$   r%   r5   @   s    zCharsetMatch.multi_byte_usagec             C   s"   | j d krt| j| jd| _ | j S )Nstrict)r"   r,   r   r   )r#   r$   r$   r%   __str__D   s    
zCharsetMatch.__str__c             C   s   d | j| jS )Nz<CharsetMatch '{}' bytes({})>)r+   r.   r/   )r#   r$   r$   r%   __repr__J   s    zCharsetMatch.__repr__c             C   s8   t |tr|| kr"td|jd |_| j| d S )Nz;Unable to add instance <{}> as a submatch of a CharsetMatch)r)   r   r1   r+   r-   r"   r   append)r#   r'   r$   r$   r%   add_submatchM   s    
zCharsetMatch.add_submatchc             C   s   | j S )N)r   )r#   r$   r$   r%   r.   X   s    zCharsetMatch.encodingc             C   sH   g }x>t  D ]2\}}| j|kr,|| q| j|kr|| qW |S )z
        Encoding name are known by many name, using this could help when searching for IBM855 when it's listed as CP855.
        )r   itemsr.   r<   )r#   Zalso_known_asupr$   r$   r%   encoding_aliases\   s    

zCharsetMatch.encoding_aliasesc             C   s   | j S )N)r   )r#   r$   r$   r%   bomi   s    zCharsetMatch.bomc             C   s   | j S )N)r   )r#   r$   r$   r%   byte_order_markm   s    zCharsetMatch.byte_order_markc             C   s   dd | j D S )z
        Return the complete list of possible languages found in decoded sequence.
        Usually not really useful. Returned list may be empty even if 'language' property return something != 'Unknown'.
        c             S   s   g | ]}|d  qS )r   r$   ).0er$   r$   r%   
<listcomp>w   s    z*CharsetMatch.languages.<locals>.<listcomp>)r   )r#   r$   r$   r%   r   q   s    zCharsetMatch.languagesc             C   sp   | j sbd| jkrdS ddlm}m} t| jr8|| jn|| j}t|dksVd|krZdS |d S | j d d S )z
        Most probable language found in decoded sequence. If none were detected or inferred, the property will return
        "Unknown".
        asciiZEnglishr   )encoding_languagesmb_encoding_languageszLatin BasedUnknown)r   could_be_from_charsetZcharset_normalizer.cdrH   rI   r   r.   r7   )r#   rH   rI   r   r$   r$   r%   languagey   s    

zCharsetMatch.languagec             C   s   | j S )N)r   )r#   r$   r$   r%   r3      s    zCharsetMatch.chaosc             C   s   | j s
dS | j d d S )Ng        r   r   )r   )r#   r$   r$   r%   r4      s    zCharsetMatch.coherencec             C   s   t | jd ddS )Nd      )ndigits)roundr3   )r#   r$   r$   r%   percent_chaos   s    zCharsetMatch.percent_chaosc             C   s   t | jd ddS )NrM   rN   )rO   )rP   r4   )r#   r$   r$   r%   percent_coherence   s    zCharsetMatch.percent_coherencec             C   s   | j S )z+
        Original untouched bytes.
        )r   )r#   r$   r$   r%   r8      s    zCharsetMatch.rawc             C   s   | j S )N)r   )r#   r$   r$   r%   submatch   s    zCharsetMatch.submatchc             C   s   t | jdkS )Nr   )r7   r   )r#   r$   r$   r%   has_submatch   s    zCharsetMatch.has_submatchc             C   s@   | j d k	r| j S dd t| D }ttdd |D | _ | j S )Nc             S   s   g | ]}t |qS r$   )r   )rD   charr$   r$   r%   rF      s    z*CharsetMatch.alphabets.<locals>.<listcomp>c             S   s   h | ]}|r|qS r$   r$   )rD   rr$   r$   r%   	<setcomp>   s    z)CharsetMatch.alphabets.<locals>.<setcomp>)r   r,   sortedlist)r#   Zdetected_rangesr$   r$   r%   	alphabets   s
    
zCharsetMatch.alphabetsc             C   s   | j gdd | jD  S )z
        The complete list of encoding that output the exact SAME str result and therefore could be the originating
        encoding.
        This list does include the encoding available in property 'encoding'.
        c             S   s   g | ]
}|j qS r$   )r.   )rD   mr$   r$   r%   rF      s    z6CharsetMatch.could_be_from_charset.<locals>.<listcomp>)r   r   )r#   r$   r$   r%   rK      s    z"CharsetMatch.could_be_from_charsetutf_8)r.   r(   c             C   s2   | j dks| j |kr,|| _ t| |d| _| jS )z
        Method to get re-encoded bytes payload using given target encoding. Default to UTF-8.
        Any errors will be simply ignored by the encoder NOT replaced.
        Nreplace)r!   r,   encoder    )r#   r.   r$   r$   r%   output   s    zCharsetMatch.outputc             C   s   t |   S )zw
        Retrieve the unique SHA256 computed using the transformed (re-encoded) payload. Not the original one.
        )r   r_   	hexdigest)r#   r$   r$   r%   r/      s    zCharsetMatch.fingerprint)N)r\   )#__name__
__module____qualname__bytesr,   floatboolr	   r&   objectr0   r6   propertyr5   r:   r;   r=   r.   r   rA   rB   rC   r   rL   r3   r4   rQ   rR   r8   rS   rT   rZ   rK   r_   r/   r$   r$   r$   r%   r   
   sR   	r   c               @   s   e Zd ZdZdeee  dddZee dddZ	e
eef ed	d
dZedddZedddZedd	ddZed dddZed dddZdS )CharsetMatchesz
    Container with every CharsetMatch items ordered by default from most probable to the less one.
    Act like a list(iterable) but does not implements all related methods.
    N)resultsc             C   s   |rt |ng | _d S )N)rX   _results)r#   rj   r$   r$   r%   r&      s    zCharsetMatches.__init__)r(   c             c   s   | j E d H  d S )N)rk   )r#   r$   r$   r%   __iter__   s    zCharsetMatches.__iter__)itemr(   c             C   sN   t |tr| j| S t |trFt|d}x| jD ]}||jkr0|S q0W tdS )z
        Retrieve a single item either by its position or encoding name (alias may be used here).
        Raise KeyError upon invalid index or encoding not present in results.
        FN)r)   intrk   r,   r   rK   KeyError)r#   rm   resultr$   r$   r%   __getitem__   s    




zCharsetMatches.__getitem__c             C   s
   t | jS )N)r7   rk   )r#   r$   r$   r%   __len__   s    zCharsetMatches.__len__c             C   s   t | jdkS )Nr   )r7   rk   )r#   r$   r$   r%   __bool__   s    zCharsetMatches.__bool__c             C   s~   t |tstdt|jt|jtkrbx4| j	D ]*}|j
|j
kr4|j|jkr4|| dS q4W | j	| t| j	| _	dS )z~
        Insert a single match. Will be inserted accordingly to preserve sort.
        Can be inserted as a submatch.
        z-Cannot append instance '{}' to CharsetMatchesN)r)   r   r1   r+   r,   r-   r7   r8   r   rk   r/   r3   r=   r<   rX   )r#   rm   matchr$   r$   r%   r<      s    

zCharsetMatches.appendr   c             C   s   | j s
dS | j d S )zQ
        Simply return the first match. Strict equivalent to matches[0].
        Nr   )rk   )r#   r$   r$   r%   best  s    zCharsetMatches.bestc             C   s   |   S )zP
        Redundant method, call the method best(). Kept for BC reasons.
        )ru   )r#   r$   r$   r%   first  s    zCharsetMatches.first)N)ra   rb   rc   __doc__r	   r   r   r&   r   rl   r   rn   r,   rq   rr   rf   rs   r<   ru   rv   r$   r$   r$   r%   ri      s   ri   c               @   sj   e Zd Zeee ee ee eee eeeee edddZe	e
eef dddZedddZd	S )
CliDetectionResult)pathr.   rA   alternative_encodingsrL   rZ   r   r3   r4   unicode_pathis_preferredc             C   sF   || _ |
| _|| _|| _|| _|| _|| _|| _|| _|	| _	|| _
d S )N)ry   r{   r.   rA   rz   rL   rZ   r   r3   r4   r|   )r#   ry   r.   rA   rz   rL   rZ   r   r3   r4   r{   r|   r$   r$   r%   r&   &  s    zCliDetectionResult.__init__)r(   c             C   s2   | j | j| j| j| j| j| j| j| j| j	| j
dS )N)ry   r.   rA   rz   rL   rZ   r   r3   r4   r{   r|   )ry   r.   rA   rz   rL   rZ   r   r3   r4   r{   r|   )r#   r$   r$   r%   __dict__@  s    zCliDetectionResult.__dict__c             C   s   t | jdddS )NT   )ensure_asciiindent)r   r}   )r#   r$   r$   r%   to_jsonP  s    zCliDetectionResult.to_jsonN)ra   rb   rc   r,   r	   r   rf   re   r&   rh   r   r   r}   r   r$   r$   r$   r%   rx   %  s   rx   N)Zencodings.aliasesr   hashlibr   jsonr   typingr   r   r   r   r	   r
   r   Zconstantr   utilsr   r   r   r   ri   r,   re   ZCoherenceMatchr   rx   r$   r$   r$   r%   <module>   s   $ UC