
    '[f 0                     z   d dl T d dlmZ e                    d          Ze                    d          Ze                    d          Ze                    d          Ze                    d          Z	e                    d          Z
e                    d	          Z G d
 de          Z G d dee          ZdS )    )*)XMLCorpusReaderz<p(?: [^>]*){0,1}>(.*?)</p>z<s(?: [^>]*){0,1}>(.*?)</s>z#<([wc](?: [^>]*){0,1}>)(.*?)</[wc]>z!<[wc](?: [^>]*){0,1}>(.*?)</[wc]>ztype="(.*?)"zana="(.*?)"ztext id="(.*?)"c                   ,    e Zd Z	 	 	 ddZdZd Zd ZdS )TEICorpusViewNr   c                 x    || _         || _        || _        || _        t                              | ||           d S )N)startpos)_tagged_textids_group_by_sent_group_by_paraStreamBackedCorpusView__init__)selfcorpus_filetaggedgroup_by_sentgroup_by_paratagsethead_lentextidss           M/var/www/piapp/venv/lib/python3.11/site-packages/nltk/corpus/reader/pl196x.pyr   zTEICorpusView.__init__   sC     ++''kH'MMMMM    i   c           
         |                     | j                  }t          |          }|                    d          |                    d          k    s|                    d          dk    rr|                                }t          |          dk    rnJ||z  }|                    d          |                    d          k    Y|                    d          dk    r|                    dd          }t                              |          }| j	        rk|D ]h}|| j	        vr]|
                    |          dz
  }||d          
                    d          t          d          z   }|d |         |||z   d          z   }ig }t                              |          D ]}	g }
t                              |	          D ]}| j        st                              |          }n:t          t!          | j        t$                              |                              }| j        r|
                    |           {|
                    |           | j        r|                    |
           |                    |
           |S )Nz<text idz</text>r   
    )	readlines	_pagesizeconcatcountreadlinelenreplaceTEXTIDfindallr
   findPARASENTr	   WORDlistmap
_parse_tag
TAGGEDWORDr   appendextendr   )r   streamblocktmpr   tidbegendoutputpara_strparasent_strsents                r   
read_blockzTEICorpusView.read_block-   sx     00u{{:&&Y)?)???EKKE
 E
E E //##C3xx1}}SLE {{:&&Y)?)???EKKE
 E
E E dB''..''= 	= = =dm++**S//A-C+**955IFC!$3$K%c	*<<EU++ 	$ 	$HD LL22 & &| T<<11DDDOZ5G5G5Q5Q R RSSD& &KK%%%%KK%%%%" $d####d####r   c                     |\  }}|                     d          r.t                              |                              d          }n-t                              |                              d          }||fS )Nwr   )
startswithANAsearchgroupTYPE)r   tag_word_tupletagwords       r   r,   zTEICorpusView._parse_tagT   sg    $d>># 	,**S//''**CC++c""((++CSyr   )Nr   N)__name__
__module____qualname__r   r   r;   r,    r   r   r   r      s]         N N N N& I% % %N    r   r   c                   r    e Zd ZdZd Zd Zd ZddZd ZddZ	dd	Z
dd
ZddZddZddZddZddZdS )Pl196xCorpusReaderi
  c                     d|v r|d         | _         nd | _         t          j        | g|R   t                              | |           |                                  d S )Ntextid_file)r
   r   r   CategorizedCorpusReader_init_textids)r   argskwargss      r   r   zPl196xCorpusReader.__init__`   si    F"""=1DMM DM -----((v666r   c           	         t          t                    | _        t          t                    | _        | j        t          | j                  5 }|D ]}|                                }|                    dd          \  }}||                                 vrt          d| j        d|d          |                    | j
                  D ]}|                     ||           	 d d d            d S # 1 swxY w Y   d S d S )N r   zIn text_id mapping file z: z
 not found)defaultdictr*   _f2t_t2fr
   openstripsplitfileids
ValueError
_delimiter_add_textids)r   fplinefile_idtext_idstext_ids         r   rO   z Pl196xCorpusReader._init_textidsk   sR   %%	%%	=$dm$$ 
< 	< 	<D::<<D(,

3(:(:%GXdllnn44(j#}}}ggg7   $,>>$/#B#B < <))'7;;;;<	<
< 
< 
< 
< 
< 
< 
< 
< 
< 
< 
< 
< 
< 
< 
< 
< 
< 
< %$s   BC33C7:C7c                     | j         |                             |           | j        |                             |           d S N)rU   r.   rV   )r   r`   rb   s      r   r]   zPl196xCorpusReader._add_textids{   s>    	'!!'***	'!!'*****r   Nc           
          d }t          t          t          d |||f                              dk    rt          d          ||d fS |                     |          d fS |xt          |t                    r|g}t           fd|D             g           }t                      }|D ]/}t           j
        |                   t          |          z  ||<   0||fS d S )Nc                 
    | d u S rd   rI   )accessors    r   <lambda>z-Pl196xCorpusReader._resolve.<locals>.<lambda>   s    T)9 r   r   z6Specify exactly one of: fileids, categories or textidsc              3   2   K   | ]}j         |         V  d S rd   )rV   ).0tr   s     r   	<genexpr>z.Pl196xCorpusReader._resolve.<locals>.<genexpr>   s)      77!1777777r   )r"   r*   filterr[   rZ   
isinstancestrsumdictsetrU   )r   rZ   
categoriesr   r2   filestdictfs   `       r   _resolvezPl196xCorpusReader._resolve   s%   99 *g6      K   D= !<<
++T11'3'' $")7777w777<<EFFE < <ty|,,s7||;a%< r   c                     |S rd   rI   )r   rD   s     r   
decode_tagzPl196xCorpusReader.decode_tag   s    
r   c                                            ||          \  }}|t           j                  S t          |t                    r|g}t          t           fd|D             g                     S )an  
        In the pl196x corpus each category is stored in single
        file and thus both methods provide identical functionality. In order
        to accommodate finer granularity, a non-standard textids() method was
        implemented. All the main functions can be supplied with a list
        of required chunks---giving much more control to the user.
        Nc              3   2   K   | ]}j         |         V  d S rd   )rU   )rj   dr   s     r   rl   z-Pl196xCorpusReader.textids.<locals>.<genexpr>   s)      99A49Q<999999r   )rw   sortedrV   rn   ro   rp   r   rZ   rs   _s   `   r   r   zPl196xCorpusReader.textids   sw     ]]7J77
?$)$$$gs## 	 iGc99999992>>???r   c                                            ||          \  }| j        }nt          |t                    r|g}rt	           fd|D                       S t	           fd|D                       S )Nc                 v    g | ]5}t                              |          d d d j        |                   6S )Fr   r   r   abspathr   rj   fileidr   r   s     r   
<listcomp>z,Pl196xCorpusReader.words.<locals>.<listcomp>   s^     
 
 
  "V,,!% '  
 
 
r   c           	      h    g | ].}t                              |          d d d j                  /S )Fr   r   rj   r   r   s     r   r   z,Pl196xCorpusReader.words.<locals>.<listcomp>   sW     	 	 	  "V,,!%  	 	 	r   rw   _fileidsrn   ro   r   r   rZ   rs   r   s   `  `r   wordszPl196xCorpusReader.words   s    ==*gFF?mGG%% 	 iG 	
 
 
 
 
 #*
 
 
   	 	 	 	 #*	 	 	  r   c                                            ||          \  }| j        }nt          |t                    r|g}rt	           fd|D                       S t	           fd|D                       S )Nc                 v    g | ]5}t                              |          d dd j        |                   6S FTr   r   r   s     r   r   z,Pl196xCorpusReader.sents.<locals>.<listcomp>   s^     
 
 
  "V,,!% '  
 
 
r   c           	      h    g | ].}t                              |          d dd j                  /S FTr   r   r   s     r   r   z,Pl196xCorpusReader.sents.<locals>.<listcomp>   sS         "V,,eT54=    r   r   r   s   `  `r   sentszPl196xCorpusReader.sents       ==*gFF?mGG%% 	 iG 	
 
 
 
 
 #*
 
 
       #*	    r   c                                            ||          \  }| j        }nt          |t                    r|g}rt	           fd|D                       S t	           fd|D                       S )Nc                 v    g | ]5}t                              |          d ddj        |                   6S r   r   r   s     r   r   z,Pl196xCorpusReader.paras.<locals>.<listcomp>   s^     
 
 
  "V,,!% '  
 
 
r   c           	      h    g | ].}t                              |          d ddj                  /S r   r   r   s     r   r   z,Pl196xCorpusReader.paras.<locals>.<listcomp>  sS         "V,,eT4$-    r   r   r   s   `  `r   paraszPl196xCorpusReader.paras   r   r   c                                            ||          \  }| j        }nt          |t                    r|g}rt	           fd|D                       S t	           fd|D                       S )Nc                 v    g | ]5}t                              |          d ddj        |                   6S TFr   r   r   s     r   r   z3Pl196xCorpusReader.tagged_words.<locals>.<listcomp>  s^     
 
 
  "V,,!% '  
 
 
r   c           	      h    g | ].}t                              |          d ddj                  /S TFr   r   r   s     r   r   z3Pl196xCorpusReader.tagged_words.<locals>.<listcomp>,  sS         "V,,dE54=    r   r   r   s   `  `r   tagged_wordszPl196xCorpusReader.tagged_words  r   r   c                                            ||          \  }| j        }nt          |t                    r|g}rt	           fd|D                       S t	           fd|D                       S )Nc                 v    g | ]5}t                              |          d d dj        |                   6S r   r   r   s     r   r   z3Pl196xCorpusReader.tagged_sents.<locals>.<listcomp>=  s^     
 
 
  "V,,!% '  
 
 
r   c           	      h    g | ].}t                              |          d d dj                  /S r   r   r   s     r   r   z3Pl196xCorpusReader.tagged_sents.<locals>.<listcomp>K  sS         "V,,dD%$-    r   r   r   s   `  `r   tagged_sentszPl196xCorpusReader.tagged_sents4  r   r   c                                            ||          \  }| j        }nt          |t                    r|g}rt	           fd|D                       S t	           fd|D                       S )Nc                 v    g | ]5}t                              |          d d d j        |                   6S )Tr   r   r   s     r   r   z3Pl196xCorpusReader.tagged_paras.<locals>.<listcomp>\  s^     
 
 
  "V,,!% '  
 
 
r   c           	      h    g | ].}t                              |          d d d j                  /S )Tr   r   r   s     r   r   z3Pl196xCorpusReader.tagged_paras.<locals>.<listcomp>j  sS         "V,,dD$    r   r   r   s   `  `r   tagged_paraszPl196xCorpusReader.tagged_parasS  r   r   c                     |                      ||          \  }}t          |          dk    rt          j        | |d                   S t	          d          )Nr   r   zExpected a single file)rw   r"   r   xml	TypeErrorr~   s       r   r   zPl196xCorpusReader.xmlr  sP    ]]7J77
w<<1"&tWQZ8884555r   rd   )NN)NNN)rF   rG   rH   r   r   rO   r]   rw   ry   r   r   r   r   r   r   r   r   rI   r   r   rK   rK   ]   s       H	 	 	< < < + + +       B  @ @ @ @ ! ! ! !F   >   >   >   >   >6 6 6 6 6 6r   rK   N)nltk.corpus.reader.apinltk.corpus.reader.xmldocsr   recompiler'   r(   r-   r)   rB   r?   r$   r   r   rN   rK   rI   r   r   <module>r      s   % $ $ $ 6 6 6 6 6 6	zz011	zz011ZZ>??
	zz677	zz/""jj  	&	'	'C C C C C* C C CLZ6 Z6 Z6 Z6 Z60/ Z6 Z6 Z6 Z6 Z6r   