# Recovered skeleton of sklearn/feature_extraction/tests/test_text.py
# (the virtualenv at /var/www/html/venv only ships the CPython 3.7 .pyc file;
# the readable material below is what survives byte-compilation).

from collections.abc import Mapping
import pickle
import re
from collections import defaultdict
from functools import partial
from io import StringIO

import numpy as np
import pytest
from numpy.testing import assert_array_almost_equal, assert_array_equal
from scipy import sparse

from sklearn.base import clone
from sklearn.feature_extraction.text import (
    ENGLISH_STOP_WORDS,
    CountVectorizer,
    HashingVectorizer,
    TfidfTransformer,
    TfidfVectorizer,
    strip_accents_ascii,
    strip_accents_unicode,
    strip_tags,
)
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.utils import IS_PYPY
from sklearn.utils._testing import (
    assert_allclose_dense_sparse,
    assert_almost_equal,
    fails_if_pypy,
    skip_if_32bit,
)
strip_tags)strip_accents_unicode)strip_accents_ascii)HashingVectorizer)CountVectorizer)TfidfTransformer)TfidfVectorizer)ENGLISH_STOP_WORDS)train_test_split)cross_val_score)GridSearchCV)Pipeline)	LinearSVC)clone)assert_array_almost_equal)assert_array_equal)IS_PYPY)assert_almost_equalfails_if_pypyassert_allclose_dense_sparseskip_if_32bit)defaultdict)partial)StringIO)zthe pizza pizza beer copyrightzthe pizza burger beer copyrightz!the the pizza beer beer copyrightzthe burger beer beer copyrightzthe coke burger coke copyrightzthe coke burger burger)zthe salad celeri copyrightz)the salad salad sparkling water copyrightzthe the celeri celeri copyrightzthe tomato tomato salad waterz the tomato salad water copyrightc             C   s   t |  S )N)r   upper)s r   \/var/www/html/venv/lib/python3.7/site-packages/sklearn/feature_extraction/tests/test_text.py	uppercase>   s    r    c             C   s   |  ddS )N   ée)replace)r   r   r   r   strip_eacuteB   s    r$   c             C   s   |   S )N)split)r   r   r   r   split_tokenizeF   s    r&   c             C   s   dgS )NZthe_ultimate_featurer   )r   r   r   r   lazy_analyzeJ   s    r'   c              C   s   d} d}t | |kstd} d}t | |ks0td} d}t | |ksHtd} d}t | |ks`td	} d
}t | |ksxtd} d}t | |kstd} d
}t | |kstd S )Nu   àáâãäåçèéêëaaaaaaceeeeu   ìíîïñòóôõöùúûüýiiiinooooouuuuyu   إu   اu   this is à testzthis is a testu   öou   ̀́̂̃ u   ȫ)r   AssertionError)aexpectedr   r   r   test_strip_accentsN   s*    r/   c              C   sd   d} d}t | |kstd} d}t | |ks0td} d}t | |ksHtd} d}t | |ks`td S )	Nu   àáâãäåçèéêër(   u   ìíîïñòóôõöùúûüýr)   u   إr+   u   this is à testzthis is a test)r   r,   )r-   r.   r   r   r   test_to_asciir   s    r0   
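# --- Illustrative sketch (added for clarity; not one of the original tests,
# and ``demo_strip_accents`` is a made-up name).  It shows what the two accent
# strippers exercised above do: ``strip_accents_unicode`` removes combining
# marks from the NFKD decomposition for any script, while
# ``strip_accents_ascii`` additionally drops every character with no ASCII
# equivalent.
def demo_strip_accents():
    import unicodedata

    word = "café"
    # NFKD splits the precomposed 'é' into 'e' + U+0301 (combining acute).
    assert unicodedata.normalize("NFKD", word) == "cafe\u0301"
    assert strip_accents_unicode(word) == "cafe"
    assert strip_accents_ascii(word) == "cafe"

    # A Greek letter survives the unicode variant but not the ascii one.
    assert strip_accents_unicode("Ωé") == "Ωe"
    assert strip_accents_ascii("Ωé") == "e"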
# ---------------------------------------------------------------------------
# Analyzer tests (test_word_analyzer_unigrams,
# test_word_analyzer_unigrams_and_bigrams, test_char_ngram_analyzer,
# test_char_wb_ngram_analyzer, test_word_ngram_analyzer).  Their bodies are
# not recoverable from the bytecode; the expected token streams that survive
# in the constant pool are:
#
# * word unigrams of "J'ai mangé du kangourou  ce midi, c'était pas très bon."
#   with strip_accents="ascii": ai, mange, du, kangourou, ce, midi, etait,
#   pas, tres, bon -- also checked with input="file" on a StringIO object,
#   with preprocessor=uppercase (AI, MANGE, ..., BON) and with
#   tokenizer=split_tokenize (j'ai, mange, du, ..., midi,, c'etait, ..., bon.),
#   parametrized over CountVectorizer and HashingVectorizer.
# * word 1-2 grams (strip_accents="unicode"): the unigrams above plus
#   "ai mange", "mange du", "du kangourou", "kangourou ce", "ce midi",
#   "midi etait", "etait pas", "pas tres", "tres bon".
# * char 3-6 grams of the same sentence start with "j'a", "'ai", "ai ",
#   "i m", " ma" and end with "s tres", " tres ", "tres b", "res bo", "es bon".
# * char_wb 3-6 grams of "A test with a file-like object!" start with
#   " a ", " te", "tes", "est", "st ", " tes" (n-grams are padded with spaces
#   at word boundaries).
# * word 3-6 grams of "This \tis a test, really.\n\n I met Harry yesterday"
#   start with "this is test", "is test really", "test really met".
# ---------------------------------------------------------------------------


def test_unicode_decode_error():
    # raw UTF-8 bytes handed to an analyzer built with encoding="ascii" must
    # raise UnicodeDecodeError, for the word and the char analyzer alike
    text = "J'ai mangé du kangourou  ce midi, c'était pas très bon."
    text_bytes = text.encode("utf-8")

    wa = CountVectorizer(ngram_range=(1, 2), encoding="ascii").build_analyzer()
    with pytest.raises(UnicodeDecodeError):
        wa(text_bytes)

    ca = CountVectorizer(
        analyzer="char", ngram_range=(3, 6), encoding="ascii"
    ).build_analyzer()
    with pytest.raises(UnicodeDecodeError):
        ca(text_bytes)
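# --- Illustrative sketch (added; not one of the original tests, and
# ``demo_build_analyzer`` is a made-up name).  ``build_analyzer`` folds
# preprocessing, tokenization and n-gram generation into a single callable,
# which is what the analyzer tests above inspect.
def demo_build_analyzer():
    word = CountVectorizer(analyzer="word", ngram_range=(1, 2)).build_analyzer()
    char = CountVectorizer(analyzer="char_wb", ngram_range=(3, 3)).build_analyzer()

    # unigrams first, then the bigrams
    assert word("Bag of words") == ["bag", "of", "words", "bag of", "of words"]
    # char_wb pads each word with spaces before slicing it into 3-grams
    assert char("abc de") == [" ab", "abc", "bc ", " de", "de "]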
def test_countvectorizer_custom_vocabulary():
    vocab = {"pizza": 0, "beer": 1}
    terms = set(vocab.keys())

    # Try a few of the supported vocabulary container types.
    for typ in [dict, list, iter, partial(defaultdict, int)]:
        v = typ(vocab)
        vect = CountVectorizer(vocabulary=v)
        vect.fit(JUNK_FOOD_DOCS)
        if isinstance(v, Mapping):
            assert vect.vocabulary_ == vocab
        else:
            assert set(vect.vocabulary_) == terms
        X = vect.transform(JUNK_FOOD_DOCS)
        assert X.shape[1] == len(terms)
        v = typ(vocab)
        vect = CountVectorizer(vocabulary=v)
        inv = vect.inverse_transform(X)
        assert len(inv) == X.shape[0]


def test_countvectorizer_custom_vocabulary_pipeline():
    what_we_like = ["pizza", "beer"]
    pipe = Pipeline(
        [
            ("count", CountVectorizer(vocabulary=what_we_like)),
            ("tfidf", TfidfTransformer()),
        ]
    )
    X = pipe.fit_transform(ALL_FOOD_DOCS)
    assert set(pipe.named_steps["count"].vocabulary_) == set(what_we_like)
    assert X.shape[1] == len(what_we_like)


def test_countvectorizer_custom_vocabulary_repeated_indices():
    vocab = {"pizza": 0, "beer": 0}
    msg = "Vocabulary contains repeated indices"
    with pytest.raises(ValueError, match=msg):
        vect = CountVectorizer(vocabulary=vocab)
        vect.fit(["pasta_siziliana"])


def test_countvectorizer_custom_vocabulary_gap_index():
    vocab = {"pizza": 1, "beer": 2}
    with pytest.raises(ValueError, match="doesn't contain index"):
        vect = CountVectorizer(vocabulary=vocab)
        vect.fit(["pasta_verdura"])


def test_countvectorizer_stop_words():
    cv = CountVectorizer()
    cv.set_params(stop_words="english")
    assert cv.get_stop_words() == ENGLISH_STOP_WORDS
    cv.set_params(stop_words="_bad_str_stop_")
    with pytest.raises(ValueError):
        cv.get_stop_words()
    cv.set_params(stop_words="_bad_unicode_stop_")
    with pytest.raises(ValueError):
        cv.get_stop_words()
    stoplist = ["some", "other", "words"]
    cv.set_params(stop_words=stoplist)
    assert cv.get_stop_words() == set(stoplist)


def test_countvectorizer_empty_vocabulary():
    with pytest.raises(ValueError, match="empty vocabulary"):
        vect = CountVectorizer(vocabulary=[])
        vect.fit(["foo"])

    with pytest.raises(ValueError, match="empty vocabulary"):
        v = CountVectorizer(max_df=1.0, stop_words="english")
        # fit on stop words only
        v.fit(["to be or not to be", "and me too", "and so do you"])


def test_fit_countvectorizer_twice():
    cv = CountVectorizer()
    X1 = cv.fit_transform(ALL_FOOD_DOCS[:5])
    X2 = cv.fit_transform(ALL_FOOD_DOCS[5:])
    assert X1.shape[1] != X2.shape[1]
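# --- Illustrative sketch (added; not one of the original tests, and
# ``demo_fixed_vocabulary_roundtrip`` is a made-up name).  It shows the
# fixed-vocabulary behaviour the tests above rely on: a user-supplied
# vocabulary is never changed by fitting, unknown words are ignored, and
# ``inverse_transform`` maps rows back to the known terms only.
def demo_fixed_vocabulary_roundtrip():
    vect = CountVectorizer(vocabulary={"pizza": 0, "beer": 1})
    X = vect.fit_transform(JUNK_FOOD_DOCS)

    assert vect.fixed_vocabulary_
    assert X.shape == (len(JUNK_FOOD_DOCS), 2)
    # "the", "copyright", ... are silently dropped; only pizza/beer counted
    assert X.toarray()[0].tolist() == [2, 1]  # "the pizza pizza beer copyright"

    inv = vect.inverse_transform(X)
    assert sorted(inv[0].tolist()) == ["beer", "pizza"]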
@pytest.mark.filterwarnings("ignore::FutureWarning:sklearn")
@pytest.mark.parametrize("get_names", ["get_feature_names", "get_feature_names_out"])
def test_countvectorizer_custom_token_pattern(get_names):
    """Check `get_feature_names()` when a custom token pattern is passed.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/12971
    """
    corpus = [
        "This is the 1st document in my corpus.",
        "This document is the 2nd sample.",
        "And this is the 3rd one.",
        "Is this the 4th document?",
    ]
    token_pattern = r"[0-9]{1,3}(?:st|nd|rd|th)\s\b(\w{2,})\b"
    vectorizer = CountVectorizer(token_pattern=token_pattern)
    vectorizer.fit_transform(corpus)
    expected = ["document", "one", "sample"]
    feature_names_out = getattr(vectorizer, get_names)()
    assert_array_equal(feature_names_out, expected)


def test_countvectorizer_custom_token_pattern_with_several_group():
    """Check that we raise an error if token pattern capture several groups.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/12971
    """
    corpus = [
        "This is the 1st document in my corpus.",
        "This document is the 2nd sample.",
        "And this is the 3rd one.",
        "Is this the 4th document?",
    ]
    token_pattern = r"([0-9]{1,3}(?:st|nd|rd|th))\s\b(\w{2,})\b"
    err_msg = "More than 1 capturing group in token pattern"
    vectorizer = CountVectorizer(token_pattern=token_pattern)
    with pytest.raises(ValueError, match=err_msg):
        vectorizer.fit(corpus)


def test_countvectorizer_uppercase_in_vocab():
    # a vocabulary with upper-case entries can never be matched when
    # lowercase=True, so fitting must warn (and transform must stay silent)
    vocabulary = ["Sample", "Upper", "Case", "Vocabulary"]
    message = (
        "Upper case characters found in vocabulary while 'lowercase' is True. "
        "These entries will not be matched with any documents"
    )
    vectorizer = CountVectorizer(lowercase=True, vocabulary=vocabulary)

    with pytest.warns(UserWarning, match=message):
        vectorizer.fit(vocabulary)

    with pytest.warns(None) as record:
        vectorizer.transform(vocabulary)
    assert not record
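# --- Illustrative sketch (added; not one of the original tests, and
# ``demo_token_pattern_capture_group`` is a made-up name).  When the
# ``token_pattern`` contains exactly one capturing group, only the group (not
# the whole match) becomes the token -- the behaviour checked just above.
def demo_token_pattern_capture_group():
    corpus = ["the 1st document", "the 2nd sample"]
    # one capturing group: keep only the word that follows the ordinal
    vect = CountVectorizer(token_pattern=r"[0-9](?:st|nd)\s(\w+)")
    vect.fit(corpus)
    assert sorted(vect.vocabulary_) == ["document", "sample"]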
def test_tf_transformer_feature_names_out():
    """Check get_feature_names_out for TfidfTransformer"""
    X = [[1, 1, 1], [1, 1, 0], [1, 0, 0]]
    tr = TfidfTransformer(smooth_idf=True, norm="l2").fit(X)

    feature_names_in = ["a", "c", "b"]
    feature_names_out = tr.get_feature_names_out(feature_names_in)
    assert_array_equal(feature_names_in, feature_names_out)


def test_tf_idf_smoothing():
    X = [[1, 1, 1], [1, 1, 0], [1, 0, 0]]
    tr = TfidfTransformer(smooth_idf=True, norm="l2")
    tfidf = tr.fit_transform(X).toarray()
    assert (tfidf >= 0).all()

    # check normalization
    assert_array_almost_equal((tfidf ** 2).sum(axis=-1), [1, 1, 1])

    # this is robust to features with only zeros
    X = [[1, 1, 0], [1, 1, 0], [1, 0, 0]]
    tr = TfidfTransformer(smooth_idf=True, norm="l2")
    tfidf = tr.fit_transform(X).toarray()
    assert (tfidf >= 0).all()


def test_tfidf_no_smoothing():
    X = [[1, 1, 1], [1, 1, 0], [1, 0, 0]]
    tr = TfidfTransformer(smooth_idf=False, norm="l2")
    tfidf = tr.fit_transform(X).toarray()
    assert (tfidf >= 0).all()

    # check normalization
    assert_array_almost_equal((tfidf ** 2).sum(axis=-1), [1, 1, 1])

    # the lack of smoothing makes the idf fragile when a feature has only
    # zeros in the training set
    X = [[1, 1, 0], [1, 1, 0], [1, 0, 0]]
    tr = TfidfTransformer(smooth_idf=False, norm="l2")

    in_warning_message = "divide by zero"
    with pytest.warns(RuntimeWarning, match=in_warning_message):
        tr.fit_transform(X).toarray()


def test_sublinear_tf():
    X = [[1], [2], [3]]
    tr = TfidfTransformer(sublinear_tf=True, use_idf=False, norm=None)
    tfidf = tr.fit_transform(X).toarray()
    assert tfidf[0] == 1
    assert tfidf[1] > tfidf[0]
    assert tfidf[2] > tfidf[1]
    assert tfidf[1] < 2
    assert tfidf[2] < 3
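# --- Illustrative sketch (added; not one of the original tests, and
# ``demo_smoothed_idf_formula`` is a made-up name).  The idf values checked
# above follow the smoothed formula used by TfidfTransformer,
#     idf(t) = ln((1 + n_samples) / (1 + df(t))) + 1,
# and ln(n_samples / df(t)) + 1 without smoothing.  A quick numeric check:
def demo_smoothed_idf_formula():
    X = np.array([[1, 1, 1], [1, 1, 0], [1, 0, 0]])
    n_samples = X.shape[0]
    df = (X > 0).sum(axis=0)  # document frequencies: [3, 2, 1]
    expected_idf = np.log((1 + n_samples) / (1 + df)) + 1

    tr = TfidfTransformer(smooth_idf=True, norm=None).fit(X)
    assert_array_almost_equal(tr.idf_, expected_idf)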
# ---------------------------------------------------------------------------
# Remaining tests.  Most bodies cannot be read back from the bytecode; their
# names (with the constants that identify what they check) are listed here,
# followed by the few short tests whose content is fully visible.
# ---------------------------------------------------------------------------
#
# Vectorizer integration and feature selection:
#   test_vectorizer, test_tfidf_vectorizer_setters, test_hashing_vectorizer,
#   test_feature_names, test_vectorizer_max_features,
#   test_count_vectorizer_max_features, test_vectorizer_max_df,
#   test_vectorizer_min_df (document-frequency pruning on the corpus
#   ["abc", "dea", "eat"] with analyzer="char"),
#   test_vectorizer_params_validation (error messages such as
#   "max_df == 2.0, must be <= 1.0." and "min_df == 1.5, must be <= 1.0."),
#   test_count_binary_occurrences (corpus ["aaabc", "abbde"], binary=True,
#   dtype=np.float32), test_hashed_binary_occurrences,
#   test_vectorizer_inverse_transform.
#
# Pipeline / model-selection integration:
#   test_count_vectorizer_pipeline_grid_selection,
#   test_vectorizer_pipeline_grid_selection,
#   test_vectorizer_pipeline_cross_validation (Pipeline of a vectorizer and
#   LinearSVC on JUNK_FOOD_DOCS vs NOTJUNK_FOOD_DOCS, grid over
#   vect__ngram_range in {(1, 1), (1, 2)} and svc__loss in
#   {"hinge", "squared_hinge"}), test_vectorizer_unicode (a Russian sample
#   sentence processed by CountVectorizer and HashingVectorizer).
#
# Pickling, cloning and idf state:
#   test_tfidf_vectorizer_with_fixed_vocabulary, test_pickling_vectorizer,
#   test_pickling_built_processors (non-regression for
#   https://github.com/scikit-learn/scikit-learn/issues/12833),
#   test_countvectorizer_vocab_sets_when_pickling,
#   test_countvectorizer_vocab_dicts_when_pickling, test_stop_words_removal,
#   test_pickling_transformer, test_transformer_idf_setter,
#   test_tfidf_vectorizer_setter, test_tfidfvectorizer_invalid_idf_attr.
#
# Input validation, dtypes and warnings:
#   test_hashingvectorizer_nan_in_docs ("np.nan is an invalid document,
#   expected byte or unicode string."), test_tfidfvectorizer_binary,
#   test_vectorizer_string_object_as_input ("Iterable over raw text documents
#   expected, string object received."), test_tfidf_transformer_type
#   (float32/float64 preserved), test_tfidf_transformer_sparse (CSC and CSR
#   inputs give the same result), test_tfidf_vectorizer_type (warning when an
#   integer dtype is requested), test_vectorizers_invalid_ngram_range,
#   the _check_stop_words_consistency helper and
#   test_vectorizer_stop_words_inconsistent (stop words such as "you've" and
#   "you'll" that the tokenizer splits differently must be flagged),
#   test_countvectorizer_sort_features_64bit_sparse_indices (skipped on
#   32-bit platforms, see
#   https://github.com/scikit-learn/scikit-learn/pull/11295),
#   test_stop_word_validation_custom_preprocessor,
#   test_callable_analyzer_error, test_callable_analyzer_change_behavior,
#   test_callable_analyzer_reraise_error, test_unused_parameters_warn
#   ("The parameter %s will not be used since %s %s"), test_n_features_in,
#   test_get_feature_names_deprecated
#   ("get_feature_names is deprecated in 1.0").


def test_non_unique_vocab():
    vocab = ["a", "b", "c", "a", "a"]
    vect = CountVectorizer(vocabulary=vocab)
    with pytest.raises(ValueError):
        vect.fit([])


def test_tfidfvectorizer_export_idf():
    vect = TfidfVectorizer(use_idf=True)
    vect.fit(JUNK_FOOD_DOCS)
    assert_array_almost_equal(vect.idf_, vect._tfidf.idf_)


def test_vectorizer_vocab_clone():
    vect_vocab = TfidfVectorizer(vocabulary=["the"])
    vect_vocab_clone = clone(vect_vocab)
    vect_vocab.fit(ALL_FOOD_DOCS)
    vect_vocab_clone.fit(ALL_FOOD_DOCS)
    assert vect_vocab_clone.vocabulary_ == vect_vocab.vocabulary_


def test_tie_breaking_sample_order_invariance():
    # max_features tie-breaking must not depend on the order of the documents
    vec = CountVectorizer(max_features=1)
    vocab1 = vec.fit(["hello", "world"]).vocabulary_
    vocab2 = vec.fit(["world", "hello"]).vocabulary_
    assert vocab1 == vocab2


def test_nonnegative_hashing_vectorizer_result_indices():
    # the hashed column indices must always be non-negative
    hashing = HashingVectorizer(n_features=1000000, ngram_range=(2, 3))
    indices = hashing.transform(["22pcs efuture"]).indices
    assert indices[0] >= 0
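# --- Illustrative sketch (added; not one of the original tests, and
# ``demo_hashing_vectorizer`` is a made-up name).  The HashingVectorizer
# exercised throughout this module is stateless: it learns no vocabulary_, so
# transform() works without fit(), and every token is mapped to one of
# ``n_features`` columns by a hash function (collisions are possible;
# ``alternate_sign`` spreads them over +1/-1 counts).
def demo_hashing_vectorizer():
    hv = HashingVectorizer(n_features=8, norm=None, alternate_sign=False)
    X = hv.transform(["pizza pizza beer", "salad water"])

    assert not hasattr(hv, "vocabulary_")  # nothing is learned
    assert X.shape == (2, 8)
    assert X.sum() == 5  # 5 tokens hashed in total, regardless of collisions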