
    '[fV!                     N    d dl Z d dlmZ d dlmZ d dlmZ  G d de          ZdS )    N)warn)ElementTree)CorpusReaderc                   X     e Zd ZdZ fdZd Zd Zd Zd Zd Z	d Z
d	 Zd
 Zd Z xZS )BCP47CorpusReaderu~  
    Parse BCP-47 composite language tags

    Supports all the main subtags, and the 'u-sd' extension:

    >>> from nltk.corpus import bcp47
    >>> bcp47.name('oc-gascon-u-sd-fr64')
    'Occitan (post 1500): Gascon: Pyrénées-Atlantiques'

    Can load a conversion table to Wikidata Q-codes:
    >>> bcp47.load_wiki_q()
    >>> bcp47.wiki_q['en-GI-spanglis']
    'Q79388'

    c                 2   t                                          ||           i | _        |                     d          5 }|                     |                                                    d                    | _        ddd           n# 1 swxY w Y   |                     d          5 }|                     t          j
        |                              d                    | _        ddd           n# 1 swxY w Y   |                                  dS )zRead the BCP-47 databasez!iana/language-subtag-registry.txtz%%
Nzcldr/common-subdivisions-en.xmlz+localeDisplayNames/subdivisions/subdivision)super__init__langcodeopen	data_dictreadsplitdbsubdiv_dictetparseiterfindsubdiv
morphology)selfrootfileidsfp	__class__s       L/var/www/piapp/venv/lib/python3.11/site-packages/nltk/corpus/reader/bcp47.pyr
   zBCP47CorpusReader.__init__    s^   w'''YY:;; 	>rnnRWWYY__V%<%<==DG	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	>YY899 	R**%%&STT DK	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	s&    A BBB,A C88C<?C<c                    |                      d          5 }|                     |                                                                                    d          dd                   | _        ddd           dS # 1 swxY w Y   dS )z:Load conversion table to Wikidata Q-codes (only if needed)z-cldr/tools-cldr-rdf-external-entityToCode.tsv
   N)r   	wiki_dictr   stripr   wiki_q)r   r   s     r   load_wiki_qzBCP47CorpusReader.load_wiki_q,   s    YYFGG 	L2..):):)@)@)F)Fqrr)JKKDK	L 	L 	L 	L 	L 	L 	L 	L 	L 	L 	L 	L 	L 	L 	L 	L 	L 	Ls   AA==BBc                 .    d d |D             D             S )z7Convert Wikidata list of Q-codes to a BCP-47 dictionaryc                 ^    i | ]*}|d          |d                              d          d         +S )r   r   /r   ).0pairs     r   
<dictcomp>z/BCP47CorpusReader.wiki_dict.<locals>.<dictcomp>3   sC     
 
 
 GT!W]]3''+
 
 
    c                 \    g | ])}|                                                     d           *S )	)r!   r   )r)   lines     r   
<listcomp>z/BCP47CorpusReader.wiki_dict.<locals>.<listcomp>5   s.    DDDd++D11DDDr,    )r   liness     r   r    zBCP47CorpusReader.wiki_dict1   s1    
 
DDeDDD
 
 
 	
r,   c                     d |D             S )z2Convert the CLDR subdivisions list to a dictionaryc                 4    i | ]}|j         d          |j        S )type)attribtext)r)   subs     r   r+   z1BCP47CorpusReader.subdiv_dict.<locals>.<dictcomp>:   s#    @@@
6"CH@@@r,   r1   )r   subdivss     r   r   zBCP47CorpusReader.subdiv_dict8   s    @@@@@@r,   c           
         t           j        t           j        t           j        t           j        t           j        d| _        d}d}d}d}t          j        |dz   d          t          j        |dz             t          j        | |dz             t          j        d|d	z   d
|dz   d          t          j        |dz   |dz   dz             t          j        |           d| _        d S )N)languageextlangscriptregionvariantz[0-9]z[a-z]z[A-Z]z[a-zA-Z0-9]   ?(   z)|()   )r;   r<   r=   r>   r?   	singleton)strlowertitleuppercasingrecompileformat)r   diglowupalnums        r   r   zBCP47CorpusReader.morphology<   s    	yiiy
 
 
c!e;;;//zSU*--jBA00j!6RT!6!6c!e!6!6!677zU1W"=uSy!m"="=>>sH--
 
r,   c                 P   |d                              dd                                          | _        i }i |d<   dD ]}i |d         |<   |dd         D ]O}d |                                                    d	          D             }|d         d         }|d         d         }||vri ||<   i }|d
d         D ]}	t	          |	          d
k    r,|	\  }
}|
|vr|g||
<   nM||
                             |           n1||
         dxx         d|	d                                         z   z  cc<   d|vr"|dk    r|
dk    r|| j        ||
         d         <   |D ],}
t	          ||
                   dk    r||
         d         ||
<   -d|v r||d         |         |<   D|||         |<   Q|S )z;Convert the BCP-47 language subtag registry to a dictionaryr   z
File-Date: 
deprecated)r;   r<   r=   r>   r?   	redundantgrandfatheredr   Nc                 8    g | ]}|                     d           S ): r(   )r)   fields     r   r0   z/BCP47CorpusReader.data_dict.<locals>.<listcomp>a   s$    PPPEekk$''PPPr,   r   rC   r'    
Deprecatedr;   Description)replacer!   versionr   lenappendr   )r   recordsdiclabelrecordfieldstyptag	subfieldsrZ   keyvals               r   r   zBCP47CorpusReader.data_dictQ   s	   qz)),;;AACCL
 		* 		*E (*Ce$$abbk 	* 	*FPPV\\^^5I5I$5O5OPPPF)A,C)A,C#~~CI < <u::??!&JS#)++*-	#!#--c2222cN2&&&#a0@0@*@@&&& ..z))},,8;DM)C."45  7 7y~&&!++%.s^A%6IcNv%%.7L!#&s++ )C
r,   c                 F    t          |          t          k    r|d         }|S )zReturn only first valuer   )r5   list)r   rk   s     r   val2strzBCP47CorpusReader.val2str   s!    99a&C
r,   c                 F    |d          }dD ]}||v r|d||          z  }|S )zConcatenate subtag valuesr;   )r<   r=   r>   r?   	extensionrY   r1   )r   	lg_recordnamerd   s       r   lang2strzBCP47CorpusReader.lang2str   sG    J')L 	0 	0E	!!/Yu-///r,   c                    |                     d          }i }g d}|r8|r5|                    d          }d}|rk|                    d          } | j        |         |          }| j        |                             |          r|| j        |         v rRd}|                     | j        |         |         d                   }|dk    r||v r||xx         d|z   z  cc<   n|||<   n|| j        d	         |         v rd}d
|d| d}	d| j        d	         |         |         v r;| j        d	         |         |         d         }
|	d|                     |
           dz  }	|                     | j        d	         |         |         d                   ||<   t          |	           n|k|s|dk    r2|d         dk    r&|d         }|| j        v r| j        |         }nqd| d}nj| d	                    d |D                        
                                }| j        d                             |          sd| d}t          |           ||d<   g }|r|5|S )z8Convert a BCP-47 tag to a dictionary of labelled subtags-)r;   r<   r=   r>   r?   r?   r   FTr]   r?   rY   rU   The r[   z code is deprecatedPreferred-Valuez', prefer ''usdr   z<Unknown subdivision: >rT   c                     g | ]}d |z   S )ru   r1   )r)   exts     r   r0   z/BCP47CorpusReader.parse_tag.<locals>.<listcomp>   s    -I-I-I#c#g-I-I-Ir,   rF   z<Invalid extension: rp   )r   poprK   rN   	fullmatchr   rn   r   r   joinrH   )r   rh   subtagslanglabelssubtagfoundrd   valstrnotepreferrz   r}   s                r   	parse_tagzBCP47CorpusReader.parse_tag   s   ))C..RRR )	& )	[[^^FE 

1+U+F33;u%//77 // $!%dgenV.D].S!T!T I--%4-- KKK4&=8KKKK*0DK47<#8#??? $KfKKKKK,0Ee0LV0TTT%)W\%:5%A&%I 1&F !$I$,,v2F2F$I$I$IID&*ll GL1%8@O' 'U T


1  2  S==WQZ4%7%7 BT[(("k"o=s===#LRWW-I-I-I-I-I%J%JLLRRTTC;{3==fEE ";S;;;S			$'[!S  )	& )	T r,   c                 L   dD ]}d}|| j         |         v r#| j         |         |         d          }d|d| }n|| j         d         |         v r~| j         d         |         |         d          }d|d| d}d| j         d         |         |         v r:| j         d         |         |         d         }|d	|                     |          z  }|rt          |           |c S 	 |                     |                     |                    S #  t          d
|d           Y dS xY w)z
        Convert a BCP-47 tag to a colon-separated string of subtag names

        >>> from nltk.corpus import bcp47
        >>> bcp47.name('ca-Latn-ES-valencia')
        'Catalan: Latin: Spain: Valencian'

        )rV   rW   Nr]   rv   z	 code is rU   z and deprecatedrw   z	, prefer zTag z was not recognized)r   rn   r   rs   r   )r   rh   rd   rk   r   r   s         r   rr   zBCP47CorpusReader.name   sj    4 	 	ECdgen$$,];=5c55e55-e444.u5c:=IKDcDDeDDD$(=e(DS(III!W\259#>?PQF@V(<(<@@@D T





	==!4!4555	222233344s   #'D D#)__name__
__module____qualname____doc__r
   r#   r    r   r   r   rn   rs   r   rr   __classcell__)r   s   @r   r   r      s          
 
 
 
 
L L L

 
 
A A A
 
 
*, , ,\    / / /b      r,   r   )	rL   warningsr   	xml.etreer   r   nltk.corpus.readerr   r   r1   r,   r   <module>r      s    
			       ' ' ' ' ' ' + + + + + +K K K K K K K K K Kr,   