
    '[f$3                     h    d dl Z d dlmZ d dlmZmZ d Z G d de          Z G d de          ZdS )	    N)CorpusReader)StreamBackedCorpusViewconcatc                 H     t          j                   d fd	            }|S )Nc                 p    |                     dd            |s|                                 } | |fi |S )Ntags)popfileids)selfr
   kwargsfuns      M/var/www/piapp/venv/lib/python3.11/site-packages/nltk/corpus/reader/ipipan.py	decoratorz_parse_args.<locals>.decorator   sF    

64    	%llnnGs4++F+++    N)	functoolswraps)r   r   s   ` r   _parse_argsr      s>    _S, , , , , , r   c                       e Zd ZdZd ZddZddZddZddZe	dd            Z
e	dd	            Ze	dd
            Ze	dd            Ze	dd            Ze	dd            Zd Zd Zd ZddZd Zd Zd ZdS )IPIPANCorpusReadera5  
    Corpus reader designed to work with corpus created by IPI PAN.
    See http://korpus.pl/en/ for more details about IPI PAN corpus.

    The corpus includes information about text domain, channel and categories.
    You can access possible values using ``domains()``, ``channels()`` and
    ``categories()``. You can use also this metadata to filter files, e.g.:
    ``fileids(channel='prasa')``, ``fileids(categories='publicystyczny')``.

    The reader supports methods: words, sents, paras and their tagged versions.
    You can get part of speech instead of full tag by giving "simplify_tags=True"
    parameter, e.g.: ``tagged_sents(simplify_tags=True)``.

    Also you can get all tags disambiguated tags specifying parameter
    "one_tag=False", e.g.: ``tagged_paras(one_tag=False)``.

    You can get all tags that were assigned by a morphological analyzer specifying
    parameter "disamb_only=False", e.g. ``tagged_words(disamb_only=False)``.

    The IPIPAN Corpus contains tags indicating if there is a space between two
    tokens. To add special "no space" markers, you should specify parameter
    "append_no_space=True", e.g. ``tagged_words(append_no_space=True)``.
    As a result in place where there should be no space between two tokens new
    pair ('', 'no-space') will be inserted (for tagged data) and just '' for
    methods without tags.

    The corpus reader can also try to append spaces between words. To enable this
    option, specify parameter "append_space=True", e.g. ``words(append_space=True)``.
    As a result either ' ' or (' ', 'space') will be inserted between tokens.

    By default, xml entities like &quot; and &amp; are replaced by corresponding
    characters. You can turn off this feature, specifying parameter
    "replace_xmlentities=False", e.g. ``words(replace_xmlentities=False)``.
    c                 6    t          j        | ||d d            d S r   )r   __init__)r   rootr
   s      r   r   zIPIPANCorpusReader.__init__=   s!    dD'4>>>>>r   Nc                 Z    |s|                                  }|                     |d          S )Nchannelr
   _parse_headerr   r
   s     r   channelszIPIPANCorpusReader.channels@   s-     	%llnnG!!'9555r   c                 Z    |s|                                  }|                     |d          S )Ndomainr   r   s     r   domainszIPIPANCorpusReader.domainsE   s-     	%llnnG!!'8444r   c                 t     |s                                  } fd                     |d          D             S )Nc                 :    g | ]}                     |          S  )_map_category).0catr   s     r   
<listcomp>z1IPIPANCorpusReader.categories.<locals>.<listcomp>M   s4     
 
 
(+Ds##
 
 
r   keyTermr   r   s   ` r   
categorieszIPIPANCorpusReader.categoriesJ   sS     	%llnnG
 
 
 
/3/A/A'9/U/U
 
 
 	
r   c                    |||t          d          |||t          j        |           S t          |t                    r|g}t          |t                    r|g}t          |t                    r|g}|r|                     d|          S |r|                     d|          S |                     d|| j                  S )NzNYou can specify only one of channels, domains and categories parameter at oncer   r!   r*   )map)
ValueErrorr   r
   
isinstancestr_list_morph_files_byr&   )r   r   r"   r+   s       r   r
   zIPIPANCorpusReader.fileidsQ   s    G$7J<R3   J4F'---h$$ 	" zHgs## 	 iGj#&& 	&$J 	,,YAAA 	,,Xw???,,:4+= -   r   c                 d     t           fd                     |          D                       S )Nc                 F    g | ]} j         |ft          j        d dS F)moder   _viewIPIPANCorpusView
SENTS_MODEr'   fileidr   r   s     r   r)   z,IPIPANCorpusReader.sents.<locals>.<listcomp>k   V         
!1!<5 LR   r   r   _list_morph_filesr   r
   r   s   ` `r   sentszIPIPANCorpusReader.sentsh   Q         #44W==	  
 
 	
r   c                 d     t           fd                     |          D                       S )Nc                 F    g | ]} j         |ft          j        d dS r4   r7   r8   
PARAS_MODEr:   s     r   r)   z,IPIPANCorpusReader.paras.<locals>.<listcomp>v   r<   r   r=   r?   s   ` `r   paraszIPIPANCorpusReader.parass   rA   r   c                 d     t           fd                     |          D                       S )Nc                 0    g | ]} j         |fd diS )r   Fr7   r:   s     r   r)   z,IPIPANCorpusReader.words.<locals>.<listcomp>   sB        
688888  r   r=   r?   s   ` `r   wordszIPIPANCorpusReader.words~   O        "44W==  
 
 	
r   c                 d     t           fd                     |          D                       S )Nc                 D    g | ]} j         |fd t          j        iS r5   r6   r:   s     r   r)   z3IPIPANCorpusReader.tagged_sents.<locals>.<listcomp>   F        
6NN(8(CNvNN  r   r=   r?   s   ` `r   tagged_sentszIPIPANCorpusReader.tagged_sents   rK   r   c                 d     t           fd                     |          D                       S )Nc                 D    g | ]} j         |fd t          j        iS rN   rD   r:   s     r   r)   z3IPIPANCorpusReader.tagged_paras.<locals>.<listcomp>   rO   r   r=   r?   s   ` `r   tagged_paraszIPIPANCorpusReader.tagged_paras   rK   r   c                 d     t           fd                     |          D                       S )Nc                 ,    g | ]} j         |fi S r%   rI   r:   s     r   r)   z3IPIPANCorpusReader.tagged_words.<locals>.<listcomp>   s-    XXXfZTZ))&))XXXr   r=   r?   s   ` `r   tagged_wordszIPIPANCorpusReader.tagged_words   s>    XXXXX8N8Nw8W8WXXX
 
 	
r   c                 @    d |                      |          D             S )Nc                     g | ]}|S r%   r%   r'   fs     r   r)   z8IPIPANCorpusReader._list_morph_files.<locals>.<listcomp>   s    222a222r   )abspathsr   s     r   r>   z$IPIPANCorpusReader._list_morph_files   s"    224==112222r   c                 @    d |                      |          D             S )Nc                 :    g | ]}|                     d d          S )	morph.xml
header.xml)replacerY   s     r   r)   z9IPIPANCorpusReader._list_header_files.<locals>.<listcomp>   s6     
 
 
 IIk<00
 
 
r   )r>   r   s     r   _list_header_filesz%IPIPANCorpusReader._list_header_files   s1    
 
++G44
 
 
 	
r   c                     t                      }|                     |          D ]2}|                     ||          }|D ]}|                    |           3t	          |          S r   )setra   _get_tagaddlist)r   r
   tagvaluesrZ   values_listvs          r   r   z IPIPANCorpusReader._parse_header   sl    ((11 	 	A--3//K   

1F||r   c                 B   |                                  }t                      }|D ]l}|                     |                              dd          }|                     ||          }|D ](}	| ||	          }	|	|v r|                    |           )mt          |          S )Nr^   r_   )r
   rc   abspathr`   rd   re   rf   )
r   rg   rh   r-   r
   ret_fileidsrZ   fpri   values
             r   r1   z'IPIPANCorpusReader._list_morph_files_by   s    ,,..ee 	' 	'Aa((lCCB--C00K$ ' '?CJJEF??OOA&&&	'
 K   r   c                 ^   g }t          |          5 }|                                }d d d            n# 1 swxY w Y   d}	 |                    d|z   |          }|dk     r|S |                    d|z   dz   |          }|                    ||t	          |          z   dz   |                    n)Nr   T<z</>   )openreadfindappendlen)r   rZ   rg   r   infileheadertag_endtag_poss           r   rd   zIPIPANCorpusReader._get_tag   s    !WW 	#[[]]F	# 	# 	# 	# 	# 	# 	# 	# 	# 	# 	# 	# 	# 	# 	#	Bkk#)W55G{{kk$*s"2G<<GKKwS1A5?@AAA	Bs   377c                 V    |                     d          }|dk    r|S ||dz   d          S )Nrr      )rv   )r   r(   poss      r   r&   z IPIPANCorpusReader._map_category   s1    hhsmm"99JsQwyy>!r   c                 T   |                     dd          }|                     dd          }|                     dd          }|                     dd          }|                     dd          }|                     d	d          }|                     d
d          }	|                     dd          }
t          |          dk    r$t          d|                                z            |s|st          d          |s|s|r|st          d          t	          ||||||||	|
	  	        S )Nr   Tr5   r   simplify_tagsFone_tagdisamb_onlyappend_no_spaceappend_spacereplace_xmlentitieszUnexpected arguments: %sz;You cannot specify both one_tag=False and disamb_only=Falsez[You cannot specify simplify_tags, one_tag or disamb_only with functions other than tagged_*)r   r5   r   r   r   r   r   r   )r	   rx   r.   keysr8   )r   filenamer   r   r5   r   r   r   r   r   r   s              r   r7   zIPIPANCorpusReader._view   sW   zz&$''zz&!$$

?E::**Y--jj55 **%6>>zz.%88$jj)>EEv;;??7&++--GHHH 	{ 	P    	 	g 	[ 	A  
  '#+% 3

 

 

 
	
r   r   )NNN)__name__
__module____qualname____doc__r   r   r"   r+   r
   r   r@   rF   rJ   rP   rS   rV   r>   ra   r   r1   rd   r&   r7   r%   r   r   r   r      s       ! !F? ? ?6 6 6 6
5 5 5 5

 
 
 
   . 
 
 
 [
 
 
 
 [
 
 
 
 [
 
 
 
 [
 
 
 
 [
 
 
 
 [

3 3 3
 
 
  ! ! ! !
B 
B 
B" " " 
  
  
  
  
r   r   c                   :    e Zd ZdZdZdZd
dZd Zd Zd Z	d Z
d	S )r8   r   r   rs   c                    t          j        | |d |d            d| _        d| _        |                    dd          | _        |                    dd          | _        |                    dt          j                  | _	        |                    dd          | _
        |                    dd          | _        |                    d	d          | _        |                    d
d          | _        |                    dd          | _        d S )NFr   r   Tr   r5   r   r   r   r   r   )r   r   in_sentencepositionr	   	show_tagsr   r8   
WORDS_MODEr5   r   r   r   r   r   )r   r   startposr   s       r   r   zIPIPANCorpusView.__init__   s    'hhMMM FD11!::mT::JJv'7'BCC	#ZZ??zz)T22%zz*;UCC"JJ~u==#)::.CT#J#J   r   c                    g }g }d}d}t                      }|                     |          }	 t          |          dk    r*|                     |           |                     |          }|dgk    r|rJ g S |                                }| xj        t          |          dz   z  c_        |                    d          r	d| _        n|                    d          rn|                    d          r6| j        r|r|s| 	                    |           d}d}d}	t                      }n|                    d          r| j        rtd| _        |                     |           | j
        | j        k    r|gS | j
        | j        k    r| j        r| 	                    |           |S |                    |           n| j
        | j        k    r|                     |           |gS n|                    d	          r=|d
d         }	| j        r*|	                    dd                              dd          }	n|                    d          rj| j        r|                    d          dk    rH||                    d          d
z   |                    d                   }
|                    |
           n|                    d          r| j        rp| j        rd |D             }| j        r| j        s%|                    |	t1          |          f           n|                    |	|                                f           n|                    |	           nn|                    d          rC| j        rd}| j        r2| j        r|                    d           n,|                    d           n|                    d          r	 )NFTr    z<chunk type="s"z<chunk type="p"z<tokz</chunkz<orth   iz&quot;"z&amp;&z<lexzdisamb=r~   z<ctagz</ctagz</tokc                 D    g | ]}|                     d           d         S ):r   )split)r'   ts     r   r)   z/IPIPANCorpusView.read_block.<locals>.<listcomp>D  s&    >>>AQ>>>r   z<ns/>)r   zno-spacez</cesAna)rc   
_read_datarx   _seekr	   r   
startswithr   r   _append_spacer5   r9   r   rw   rE   r   r`   r   rv   indexre   r   r   r   tupler   )r   streamsentence	sentencesspaceno_spacer   lineslineorthrg   s              r   
read_blockzIPIPANCorpusView.read_block  s   	uu''C	 5zzQ

6"""//}}$$$$	99;;DMMSYY]*MM011 5#'  !233 3(( 1$ 1 1x 1&&x000 uu++ *# '',D$JJv&&&yDO33 (z)do55, 9 ..x888'!((2222Y$/11JJv&&&%;& 2 )) AbDz+ M<<#66>>wLLD(( ' "499Y+?+?2+E+Etzz'22Q6H9M9MMNCHHSMMM)) > *) ?>>>>>< <t/? < uT{{(;<<<< txxzz(:;;;;OOD)))))) 	$ $#H' ,~ , (89999 +++,, GC	r   c                     |                                 | _        |                    d          }|                    d          }|                                 |S )Ni   
)tellr   ru   r   reverse)r   r   buffr   s       r   r   zIPIPANCorpusView._read_dataV  sE    {{4  

4  r   c                 :    |                     | j                   d S r   )seekr   )r   r   s     r   r   zIPIPANCorpusView._seek]  s    DM"""""r   c                 l    | j         r|                    d           d S |                    d           d S )N) r   r   )r   rw   )r   r   s     r   r   zIPIPANCorpusView._append_space`  s<    > 	!OON+++++OOC     r   N)r   )r   r   r   r   r9   rE   r   r   r   r   r   r%   r   r   r8   r8      s{        JJJK K K KM M M^  # # #! ! ! ! !r   r8   )	r   nltk.corpus.reader.apir   nltk.corpus.reader.utilr   r   r   r   r8   r%   r   r   <module>r      s        / / / / / / B B B B B B B B  W
 W
 W
 W
 W
 W
 W
 W
tq! q! q! q! q!- q! q! q! q! q!r   