
    '[f                         d Z ddlmZ ddlmZ 	 ddlZn# e$ r dZY nw xY w G d d          Zd Z	e
dk    r e	             dS dS )	a  
A module for language identification using the TextCat algorithm.
An implementation of the text categorization algorithm
presented in Cavnar, W. B. and J. M. Trenkle,
"N-Gram-Based Text Categorization".

The algorithm takes advantage of Zipf's law and uses
n-gram frequencies to profile languages and text-yet to
be identified-then compares using a distance measure.

Language n-grams are provided by the "An Crubadan"
project. A corpus reader was created separately to read
those files.

For details regarding the algorithm, see:
https://www.let.rug.nl/~vannoord/TextCat/textcat.pdf

For details about An Crubadan, see:
https://borel.slu.edu/crubadan/index.html
    )maxsize)trigramsNc                   F    e Zd ZdZi ZdZdZi Zd Zd Z	d Z
d Zd Zd	 ZdS )
TextCatN<>c                     t           st          d          ddlm} || _        | j                                        D ]}| j                            |           d S )Nzclassify.textcat requires the regex module that supports unicode. Try '$ pip install regex' and see https://pypi.python.org/pypi/regex for further details.r   )crubadan)reOSErrornltk.corpusr
   _corpuslangs	lang_freq)selfr
   langs      I/var/www/piapp/venv/lib/python3.11/site-packages/nltk/classify/textcat.py__init__zTextCat.__init__8   s|     	#   	)(((((L&&(( 	) 	)DL""4((((	) 	)    c                 .    t          j        dd|          S )z)Get rid of punctuation except apostrophesz[^\P{P}\']+ )r   subr   texts     r   remove_punctuationzTextCat.remove_punctuationH   s    vnb$///r   c                    ddl m}m} |                     |          } ||          } |            }|D ]L}t	          | j        |z   | j        z             }d |D             }	|	D ]}
|
|v r||
xx         dz  cc<   d||
<   M|S )z'Create FreqDist of trigrams within textr   )FreqDistword_tokenizec                 8    g | ]}d                      |          S )r   )join).0tris     r   
<listcomp>z#TextCat.profile.<locals>.<listcomp>V   s"    KKKsbggcllKKKr      )nltkr   r   r   r   _START_CHAR	_END_CHAR)r   r   r   r   
clean_texttokensfingerprintttoken_trigram_tuplestoken_trigramscur_trigrams              r   profilezTextCat.profileL   s    00000000,,T22
z**hjj 	1 	1A#+D,<q,@4>,Q#R#R KK6JKKKN- 1 1+--,,,1,,,,/0K,,	1 r   c                 J   | j                             |          }d}||v r{t          |                                                              |          }t          |                                                              |          }t          ||z
            }nt          }|S )zgCalculate the "out-of-place" measure between the
        text and language profile for a single trigramr   )r   r   listkeysindexabsr   )r   r   trigramtext_profilelang_fddistidx_lang_profileidx_texts           r   	calc_distzTextCat.calc_dist`   s     ,((..g#GLLNN3399'BBL--//0066w??H '(233DD
 Dr   c                     i }|                      |          }| j        j                                        D ](}d}|D ]}||                     |||          z  }|||<   )|S )zOCalculate the "out-of-place" measure between
        the text and all languagesr   )r/   r   _all_lang_freqr2   r;   )r   r   	distancesr/   r   	lang_distr5   s          r   
lang_distszTextCat.lang_distsu   s     	,,t$$L/4466 	( 	(D I" D DT^^D'7CCC		'IdOOr   c                 v    |                      |          | _        t          | j        | j        j                  S )zYFind the language with the min distance
        to the text and return its ISO 639-3 code)key)r@   last_distancesmingetr   s     r   guess_languagezTextCat.guess_language   s4     #ood334&D,?,CDDDDr   )__name__
__module____qualname__r   fingerprintsr&   r'   rC   r   r   r/   r;   r@   rF    r   r   r   r   /   s        GLKIN) ) ) 0 0 0  (  *  $E E E E Er   r   c            
      6   ddl m}  g d}dddddd	d
ddd	}t                      }|D ]}|                     |          }t	          |          dz
  }t          t          t          |                    }d}t          d|          D ]6}	d}
t          d||	                   D ]}|
d||	         |         z   z  }
||
z  }7t          d|dd         z   dz              |	                    |          }t          d| d||          d           t          d           d S )Nr   )udhr)	zKurdish-UTF8zAbkhaz-UTF8zFarsi_Persian-UTF8z
Hindi-UTF8zHawaiian-UTF8zRussian-UTF8zVietnamese-UTF8zSerbian_Srpski-UTF8zEsperanto-UTF8zNorthern Kurdish	AbkhazianzIranian PersianHindiHawaiianRussian
VietnameseSerbian	Esperanto)	kmrabkpeshinhawrusviesrpepor$   r    zLanguage snippet:    z...zLanguage detection: z ()z############################################################################################################################################)
r   rM   r   sentslenr1   maprangeprintrF   )rM   r   friendlytccur_langraw_sentencesrowscolssampleicur_sentjguesss                r   demorq      s         
 
 
E " 
 
H 
B  

8,,=!!A%C]++,, q$ 	 	AH1d1g&& 6 6C-"21"555hFF 	"VAcE]2U:;;;!!&))@U@@huo@@@AAAi) r   __main__)__doc__sysr   	nltk.utilr   regexr   ImportErrorr   rq   rG   rK   r   r   <module>rx      s    *               	BBB]E ]E ]E ]E ]E ]E ]E ]EB1 1 1h zDFFFFF s    