B
    @
dRG                 @   sZ  d dl mZ d dlmZ d dlmZmZ ddlmZm	Z	m
Z
 ddlmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZ G dd dZG d	d
 d
eZG dd deZG dd deZG dd deZ G dd deZ!G dd deZ"G dd deZ#G dd deZ$eddee% ee% e&dddZ'eddd%e%e(e&e(d!d"d#Z)d$S )&    )	lru_cache)	getLogger)ListOptional   )COMMON_SAFE_ASCII_CHARACTERSTRACEUNICODE_SECONDARY_RANGE_KEYWORD)is_accentuatedis_asciiis_case_variableis_cjkis_emoticon	is_hangulis_hiraganais_katakanais_latinis_punctuationis_separator	is_symbolis_thaiis_unprintableremove_accentunicode_rangec               @   sP   e Zd ZdZeedddZeddddZddd	d
Ze	e
dddZdS )MessDetectorPluginzy
    Base abstract class used for mess detection plugins.
    All detectors MUST extend and implement given methods.
    )	characterreturnc             C   s   t dS )z@
        Determine if given character should be fed in.
        N)NotImplementedError)selfr    r   E/tmp/pip-install-587nrtaf/charset-normalizer/charset_normalizer/md.pyeligible$   s    zMessDetectorPlugin.eligibleNc             C   s   t dS )z
        The main routine to be executed upon character.
        Insert the logic in witch the text would be considered chaotic.
        N)r   )r   r   r   r   r    feed*   s    zMessDetectorPlugin.feed)r   c             C   s   t dS )zB
        Permit to reset the plugin to the initial state.
        N)r   )r   r   r   r    reset1   s    zMessDetectorPlugin.resetc             C   s   t dS )z
        Compute the chaos ratio based on what your feed() has seen.
        Must NOT be lower than 0.; No restriction gt 0.
        N)r   )r   r   r   r    ratio7   s    zMessDetectorPlugin.ratio)__name__
__module____qualname____doc__strboolr!   r"   r#   propertyfloatr$   r   r   r   r    r      s   r   c               @   sZ   e Zd ZddddZeedddZedddd	Zddd
dZe	e
dddZdS ) TooManySymbolOrPunctuationPluginN)r   c             C   s"   d| _ d| _d| _d | _d| _d S )Nr   F)_punctuation_count_symbol_count_character_count_last_printable_charZ_frenzy_symbol_in_word)r   r   r   r    __init__A   s
    z)TooManySymbolOrPunctuationPlugin.__init__)r   r   c             C   s   |  S )N)isprintable)r   r   r   r   r    r!   I   s    z)TooManySymbolOrPunctuationPlugin.eligiblec             C   sp   |  j d7  _ || jkrf|tkrft|r8|  jd7  _n.| dkrft|rft|dkrf|  jd7  _|| _d S )Nr   F   )	r0   r1   r   r   r.   isdigitr   r   r/   )r   r   r   r   r    r"   L   s    
z%TooManySymbolOrPunctuationPlugin.feedc             C   s   d| _ d| _d| _d S )Nr   )r.   r0   r/   )r   r   r   r    r#   ^   s    z&TooManySymbolOrPunctuationPlugin.resetc             C   s0   | j dkrdS | j| j | j  }|dkr,|S dS )Nr   g        g333333?)r0   r.   r/   )r   Zratio_of_punctuationr   r   r    r$   c   s
    

z&TooManySymbolOrPunctuationPlugin.ratio)r%   r&   r'   r2   r)   r*   r!   r"   r#   r+   r,   r$   r   r   r   r    r-   @   s   r-   c               @   sZ   e Zd ZddddZeedddZedddd	Zddd
dZe	e
dddZdS )TooManyAccentuatedPluginN)r   c             C   s   d| _ d| _d S )Nr   )r0   _accentuated_count)r   r   r   r    r2   p   s    z!TooManyAccentuatedPlugin.__init__)r   r   c             C   s   |  S )N)isalpha)r   r   r   r   r    r!   t   s    z!TooManyAccentuatedPlugin.eligiblec             C   s(   |  j d7  _ t|r$|  jd7  _d S )Nr   )r0   r
   r7   )r   r   r   r   r    r"   w   s    zTooManyAccentuatedPlugin.feedc             C   s   d| _ d| _d S )Nr   )r0   r7   )r   r   r   r    r#   }   s    zTooManyAccentuatedPlugin.resetc             C   s4   | j dks| j dk rdS | j| j  }|dkr0|S dS )Nr      g        gffffff?)r0   r7   )r   Zratio_of_accentuationr   r   r    r$      s    zTooManyAccentuatedPlugin.ratio)r%   r&   r'   r2   r)   r*   r!   r"   r#   r+   r,   r$   r   r   r   r    r6   o   s   r6   c               @   sZ   e Zd ZddddZeedddZedddd	Zddd
dZe	e
dddZdS )UnprintablePluginN)r   c             C   s   d| _ d| _d S )Nr   )_unprintable_countr0   )r   r   r   r    r2      s    zUnprintablePlugin.__init__)r   r   c             C   s   dS )NTr   )r   r   r   r   r    r!      s    zUnprintablePlugin.eligiblec             C   s(   t |r|  jd7  _|  jd7  _d S )Nr   )r   r;   r0   )r   r   r   r   r    r"      s    zUnprintablePlugin.feedc             C   s
   d| _ d S )Nr   )r;   )r   r   r   r    r#      s    zUnprintablePlugin.resetc             C   s   | j dkrdS | jd | j  S )Nr   g        r9   )r0   r;   )r   r   r   r    r$      s    
zUnprintablePlugin.ratio)r%   r&   r'   r2   r)   r*   r!   r"   r#   r+   r,   r$   r   r   r   r    r:      s   r:   c               @   sZ   e Zd ZddddZeedddZedddd	Zddd
dZe	e
dddZdS )SuspiciousDuplicateAccentPluginN)r   c             C   s   d| _ d| _d | _d S )Nr   )_successive_countr0   _last_latin_character)r   r   r   r    r2      s    z(SuspiciousDuplicateAccentPlugin.__init__)r   r   c             C   s   |  ot|S )N)r8   r   )r   r   r   r   r    r!      s    z(SuspiciousDuplicateAccentPlugin.eligiblec             C   st   |  j d7  _ | jd k	rjt|rjt| jrj| rJ| j rJ|  jd7  _t|t| jkrj|  jd7  _|| _d S )Nr   )r0   r>   r
   isupperr=   r   )r   r   r   r   r    r"      s    

z$SuspiciousDuplicateAccentPlugin.feedc             C   s   d| _ d| _d | _d S )Nr   )r=   r0   r>   )r   r   r   r    r#      s    z%SuspiciousDuplicateAccentPlugin.resetc             C   s   | j dkrdS | jd | j  S )Nr   g        r4   )r0   r=   )r   r   r   r    r$      s    
z%SuspiciousDuplicateAccentPlugin.ratio)r%   r&   r'   r2   r)   r*   r!   r"   r#   r+   r,   r$   r   r   r   r    r<      s   r<   c               @   sZ   e Zd ZddddZeedddZedddd	Zddd
dZe	e
dddZdS )SuspiciousRangeN)r   c             C   s   d| _ d| _d | _d S )Nr   )"_suspicious_successive_range_countr0   _last_printable_seen)r   r   r   r    r2      s    zSuspiciousRange.__init__)r   r   c             C   s   |  S )N)r3   )r   r   r   r   r    r!      s    zSuspiciousRange.eligiblec             C   sx   |  j d7  _ | s&t|s&|tkr0d | _d S | jd krD|| _d S t| j}t|}t||rn|  jd7  _|| _d S )Nr   )r0   isspacer   r   rB   r    is_suspiciously_successive_rangerA   )r   r   unicode_range_aunicode_range_br   r   r    r"      s    


zSuspiciousRange.feedc             C   s   d| _ d| _d | _d S )Nr   )r0   rA   rB   )r   r   r   r    r#      s    zSuspiciousRange.resetc             C   s.   | j dkrdS | jd | j  }|dk r*dS |S )Nr   g        r4   g?)r0   rA   )r   Zratio_of_suspicious_range_usager   r   r    r$      s    
zSuspiciousRange.ratio)r%   r&   r'   r2   r)   r*   r!   r"   r#   r+   r,   r$   r   r   r   r    r@      s   r@   c               @   sZ   e Zd ZddddZeedddZedddd	Zddd
dZe	e
dddZdS )SuperWeirdWordPluginN)r   c             C   s:   d| _ d| _d| _d| _d| _d| _d| _d| _d| _d S )Nr   F )	_word_count_bad_word_count_foreign_long_count_is_current_word_bad_foreign_long_watchr0   _bad_character_count_buffer_buffer_accent_count)r   r   r   r    r2      s    zSuperWeirdWordPlugin.__init__)r   r   c             C   s   dS )NTr   )r   r   r   r   r    r!   	  s    zSuperWeirdWordPlugin.eligiblec             C   s  |  r|  j|7  _t|r,|  jd7  _| jdkrt|dksJt|rt|dkrt|dkrt|dkrt	|dkrt
|dkrd| _d S | jsd S | st|st|r| jr|  jd7  _t| j}|  j|7  _|dkr6| j| dkrd| _t| jd r6| jd  r6|  jd7  _d| _|dkr\| jr\|  jd7  _d| _| jr|  jd7  _|  jt| j7  _d| _d| _d| _d	| _n6|d
kr| dkrt|rd| _|  j|7  _d S )Nr   FT   g(\?   rH   r   >   |>=-<_~)r8   rO   r
   rP   rM   r   r   r   r   r   r   rC   r   r   rI   lenr0   rL   r?   rK   rJ   rN   r5   r   )r   r   Zbuffer_lengthr   r   r    r"     sR    


 

zSuperWeirdWordPlugin.feedc             C   s4   d| _ d| _d| _d| _d| _d| _d| _d| _d S )NrH   Fr   )rO   rL   rM   rJ   rI   r0   rN   rK   )r   r   r   r    r#   B  s    zSuperWeirdWordPlugin.resetc             C   s$   | j dkr| jdkrdS | j| j S )N
   r   g        )rI   rK   rN   r0   )r   r   r   r    r$   L  s    zSuperWeirdWordPlugin.ratio)r%   r&   r'   r2   r)   r*   r!   r"   r#   r+   r,   r$   r   r   r   r    rG      s   6
rG   c               @   s^   e Zd ZdZddddZeedddZeddd	d
ZddddZ	e
edddZdS )CjkInvalidStopPluginu   
    GB(Chinese) based encoding often render the stop incorrectly when the content does not fit and
    can be easily detected. Searching for the overuse of '丅' and '丄'.
    N)r   c             C   s   d| _ d| _d S )Nr   )_wrong_stop_count_cjk_character_count)r   r   r   r    r2   Z  s    zCjkInvalidStopPlugin.__init__)r   r   c             C   s   dS )NTr   )r   r   r   r   r    r!   ^  s    zCjkInvalidStopPlugin.eligiblec             C   s4   |dkr|  j d7  _ d S t|r0|  jd7  _d S )N>      丅   丄r   )r^   r   r_   )r   r   r   r   r    r"   a  s
    zCjkInvalidStopPlugin.feedc             C   s   d| _ d| _d S )Nr   )r^   r_   )r   r   r   r    r#   h  s    zCjkInvalidStopPlugin.resetc             C   s   | j dk rdS | j| j  S )N   g        )r_   r^   )r   r   r   r    r$   l  s    
zCjkInvalidStopPlugin.ratio)r%   r&   r'   r(   r2   r)   r*   r!   r"   r#   r+   r,   r$   r   r   r   r    r]   T  s   r]   c               @   sZ   e Zd ZddddZeedddZedddd	Zddd
dZe	e
dddZdS )ArchaicUpperLowerPluginN)r   c             C   s.   d| _ d| _d| _d| _d| _d | _d| _d S )NFr   T)_buf_character_count_since_last_sep_successive_upper_lower_count#_successive_upper_lower_count_finalr0   _last_alpha_seen_current_ascii_only)r   r   r   r    r2   t  s    z ArchaicUpperLowerPlugin.__init__)r   r   c             C   s   dS )NTr   )r   r   r   r   r    r!     s    z ArchaicUpperLowerPlugin.eligiblec             C   s$  |  ot|}|dk}|r| jdkr| jdkrV| dkrV| jdkrV|  j| j7  _d| _d| _d | _d| _|  j	d7  _	d| _d S | jdkrt
|dkrd| _| jd k	r| r| j s| r| j r| jdkr|  jd7  _d| _qd| _nd| _|  j	d7  _	|  jd7  _|| _d S )NFr   @   r   Tr4   )r8   r   re   r5   ri   rg   rf   rh   rd   r0   r   r?   islower)r   r   Zis_concernedZ	chunk_sepr   r   r    r"     s8    




zArchaicUpperLowerPlugin.feedc             C   s.   d| _ d| _d| _d| _d | _d| _d| _d S )Nr   FT)r0   re   rf   rg   rh   rd   ri   )r   r   r   r    r#     s    zArchaicUpperLowerPlugin.resetc             C   s   | j dkrdS | j| j  S )Nr   g        )r0   rg   )r   r   r   r    r$     s    
zArchaicUpperLowerPlugin.ratio)r%   r&   r'   r2   r)   r*   r!   r"   r#   r+   r,   r$   r   r   r   r    rc   s  s   *	rc   i   )maxsize)rE   rF   r   c             C   s~  | dks|dkrdS | |kr dS d| kr4d|kr4dS d| ksDd|krHdS d| ksXd|krld| kshd|krldS |  d| d }}x"|D ]}|tkrq||krdS qW | dk|dk }}|s|rd	| ksd	|krdS |r|rdS d
| ksd
|kr"d	| ksd	|kr
dS | dks|dkr"dS d	| ksJd	|ksJ| dkrz|dkrzd| ks^d|krbdS d| ksvd|krzdS dS )za
    Determine if two Unicode range seen next to each other can be considered as suspicious.
    NTFZLatinZ	EmoticonsZ	Combining )HiraganaKatakanaCJKZHangulzBasic Latin)ro   rn   ZPunctuationZForms)splitr	   )rE   rF   Zkeywords_range_aZkeywords_range_belZrange_a_jp_charsZrange_b_jp_charsr   r   r    rD     sP    


rD   i   皙?F)decoded_sequencemaximum_thresholddebugr   c          	   C   sb  dd t  D }t| d }d}|dk r0d}n|dkr>d}nd	}x|t| d
 t|D ]f\}}x |D ]}	|	|rd|	| qdW |dkr|| dks||d krVtdd |D }||krVP qVW |rXtd}
|
	t
d| d| d|  t| dkr.|
	t
d| dd   |
	t
d| dd   x(|D ] }|
	t
|j d|j  q4W t|dS )zw
    Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier.
    c             S   s   g | ]
}| qS r   r   ).0Zmd_classr   r   r    
<listcomp>  s    zmess_ratio.<locals>.<listcomp>r   g        i       i   rj      
r   c             s   s   | ]}|j V  qd S )N)r$   )rw   dtr   r   r    	<genexpr>%  s    zmess_ratio.<locals>.<genexpr>Zcharset_normalizerzIMess-detector extended-analysis start. intermediary_mean_mess_ratio_calc=z mean_mess_ratio=z maximum_threshold=rb   zStarting with: NzEnding with: iz:    )r   __subclasses__r[   zipranger!   r"   sumr   logr   	__class__r$   round)rt   ru   rv   Z	detectorslengthZmean_mess_ratioZ!intermediary_mean_mess_ratio_calcr   indexdetectorloggerr|   r   r   r    
mess_ratio  s8    	


 r   N)rs   F)*	functoolsr   loggingr   typingr   r   Zconstantr   r   r	   utilsr
   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r-   r6   r:   r<   r@   rG   r]   rc   r)   r*   rD   r,   r   r   r   r   r    <module>   s&   H"/%4ZLD