
    '[f?                         d dl Z d dlZd dlZd dlZd dlmZ d dlmZmZ d Z	 G d de          Z
 G d de          Z G d	 d
          Z G d de          Z G d de          Z G d de          ZdS )    Nconcat)XMLCorpusReaderXMLCorpusViewc                 H     t          j                   d fd	            }|S )zj
    Wraps function arguments:
    if fileids not specified then function set NKJPCorpusReader paths.
    Nc                 *    |s| j         } | |fi |S N)_paths)selffileidskwargsfuns      K/var/www/piapp/venv/lib/python3.11/site-packages/nltk/corpus/reader/nkjp.py	decoratorz_parse_args.<locals>.decorator   s,     	"kGs4++F+++    r	   )	functoolswraps)r   r   s   ` r   _parse_argsr      s@     _S, , , , , ,
 r   c                       e Zd ZdZdZdZdZddZd Zd Z	dd
Z
d Zedd            Zedd            Zedd            Zedd            Zedd            Zd	S )NKJPCorpusReaderr            .*c                     t          |t                    rt          j        | ||dz              n t          j        | |d |D                        |                                 | _        dS )aN  
        Corpus reader designed to work with National Corpus of Polish.
        See http://nkjp.pl/ for more details about NKJP.
        use example:
        import nltk
        import nkjp
        from nkjp import NKJPCorpusReader
        x = NKJPCorpusReader(root='/home/USER/nltk_data/corpora/nkjp/', fileids='') # obtain the whole corpus
        x.header()
        x.raw()
        x.words()
        x.tagged_words(tags=['subst', 'comp'])  #Link to find more tags: nkjp.pl/poliqarp/help/ense2.html
        x.sents()
        x = NKJPCorpusReader(root='/home/USER/nltk_data/corpora/nkjp/', fileids='Wilk*') # obtain particular file(s)
        x.header(fileids=['WilkDom', '/home/USER/nltk_data/corpora/nkjp/WilkWilczy'])
        x.tagged_words(fileids=['WilkDom', '/home/USER/nltk_data/corpora/nkjp/WilkWilczy'], tags=['subst', 'comp'])
        z.*/header.xmlc                     g | ]}|d z   S )z/header.xml ).0fileids     r   
<listcomp>z-NKJPCorpusReader.__init__.<locals>.<listcomp><   s    JJJVm3JJJr   N)
isinstancestrr   __init__	get_pathsr
   )r   rootr   s      r   r#   zNKJPCorpusReader.__init__&   su    $ gs## 	$T4?1JKKKK$dJJ'JJJ   nn&&r   c                 *      fd j         D             S )Nc                     g | ]M}t           j                            t          j                  |                    d           d                   NS 
header.xmlr   )ospathjoinr"   _rootsplit)r   fr   s     r   r    z.NKJPCorpusReader.get_paths.<locals>.<listcomp>A   sR     
 
 
 GLLTZ!'',*?*?*BCC
 
 
r   _fileidsr   s   `r   r$   zNKJPCorpusReader.get_paths@   s0    
 
 
 
]
 
 
 	
r   c                 $    d | j         D             S )zf
        Returns a list of file identifiers for the fileids that make up
        this corpus.
        c                 D    g | ]}|                     d           d         S r(   r.   )r   r/   s     r   r    z,NKJPCorpusReader.fileids.<locals>.<listcomp>K   s)    @@@Q%%a(@@@r   r0   r2   s    r   r   zNKJPCorpusReader.fileidsF   s    
 A@$-@@@@r   Nc                 n   |                     dt          j                  }|t          j        u rt          ||          S |t          j        u rt          ||          S |t          j        u rt          ||          S |t          j        u rt          ||t          j                  S t          d          )zQ
        Returns a view specialised for use with particular corpus file.
        mode)tags)r8   r7   zNo such mode!)popr   
WORDS_MODENKJPCorpus_Morph_View
SENTS_MODENKJPCorpus_Segmentation_ViewHEADER_MODENKJPCorpus_Header_ViewRAW_MODENKJPCorpus_Text_View	NameError)r   filenamer8   r   r7   s        r   _viewzNKJPCorpusReader._viewM   s     zz&"2"=>>#...(====%000/tDDDD%111)(>>>>%...'t*>*G   
 O,,,r   c                 ,    | j         |v r|S | j         |z   S )z<
        Add root if necessary to specified fileid.
        )r%   )r   r   s     r   add_rootzNKJPCorpusReader.add_root`   s#     9My6!!r   c                 >     t           fd|D                       S )z9
        Returns header(s) of specified fileids.
        c                     g | ]A} j                             |          fd t          j        i                                BS r7   )rD   rF   r   r>   handle_queryr   r   r   r   s     r   r    z+NKJPCorpusReader.header.<locals>.<listcomp>n   si         
MM&)) 0@0LPV ,..  r   r   r   r   r   s   ` `r   headerzNKJPCorpusReader.headerh   E    
      &	  
 
 	
r   c                 >     t           fd|D                       S )z9
        Returns sentences in specified fileids.
        c                     g | ]A} j                             |          fd t          j        i                                BS rI   )rD   rF   r   r<   rJ   rK   s     r   r    z*NKJPCorpusReader.sents.<locals>.<listcomp>|   i         
MM&)) 0@0KOU ,..  r   r   rL   s   ` `r   sentszNKJPCorpusReader.sentsv   rN   r   c                 >     t           fd|D                       S )5
        Returns words in specified fileids.
        c                     g | ]A} j                             |          fd t          j        i                                BS rI   rD   rF   r   r:   rJ   rK   s     r   r    z*NKJPCorpusReader.words.<locals>.<listcomp>   rQ   r   r   rL   s   ` `r   wordszNKJPCorpusReader.words   sE          &	  
 
 	
r   c                 n                          dg           t           fd|D                       S )z
        Call with specified tags as a list, e.g. tags=['subst', 'comp'].
        Returns tagged words in specified fileids.
        r8   c                     g | ]B} j                             |          ft          j        d                                 CS ))r7   r8   rV   )r   r   r   r   r8   s     r   r    z1NKJPCorpusReader.tagged_words.<locals>.<listcomp>   sp         
MM&)))4  	 
 ,..  r   )r9   r   )r   r   r   r8   s   ` `@r   tagged_wordszNKJPCorpusReader.tagged_words   s^     zz&"%%      &  

 

 
	
r   c                 >     t           fd|D                       S )rT   c                     g | ]A} j                             |          fd t          j        i                                BS rI   )rD   rF   r   r@   rJ   rK   s     r   r    z(NKJPCorpusReader.raw.<locals>.<listcomp>   si         
MM&)) 0@0IMS ,..  r   r   rL   s   ` `r   rawzNKJPCorpusReader.raw   rN   r   )r   r	   )__name__
__module____qualname__r:   r<   r>   r@   r#   r$   r   rD   rF   r   rM   rR   rW   rZ   r]   r   r   r   r   r       s        JJKH' ' ' '4
 
 
A A A- - - -&" " " 
 
 
 [
 
 
 
 [
 
 
 
 [
 
 
 
 [
$ 
 
 
 [
 
 
r   r   c                        e Zd Zd Zd Zd ZdS )r?   c                 P    d| _         t          j        | |dz   | j                    dS )z
        HEADER_MODE
        A stream backed corpus view specialized for use with
        header.xml files in NKJP corpus.
        z.*/sourceDesc$r)   N)tagspecr   r#   r   rC   r   s      r   r#   zNKJPCorpus_Header_View.__init__   s-     (tX%<dlKKKKKr   c                     |                                   g }	 t          j        | | j                  }t	          |          dk    rn|                    |           D|                                  |S NTr   )_openr   
read_block_streamlenextendclose)r   rM   segms      r   rJ   z#NKJPCorpus_Header_View.handle_query   sj    

	  +D$,??D4yyA~~MM$		 
 	

r   c                    |                     d          }g }|rd                    d |D                       }|                     d          }g }|rd                    d |D                       }|                     d          }g }|rd                    d |D                       }|                     d          }	g }
|	rd                    d	 |	D                       }
|                     d
          }g }|rd                    d |D                       }|                     d          }g }|rd                    d |D                       }||||
||dS )Nz
bibl/title
c              3   H   K   | ]}|j                                         V  d S r	   textstrip)r   titles     r   	<genexpr>z4NKJPCorpus_Header_View.handle_elt.<locals>.<genexpr>   s0      EEUej..00EEEEEEr   zbibl/authorc              3   H   K   | ]}|j                                         V  d S r	   rq   )r   authors     r   ru   z4NKJPCorpus_Header_View.handle_elt.<locals>.<genexpr>   s0      IIvv{0022IIIIIIr   z	bibl/datec              3   H   K   | ]}|j                                         V  d S r	   rq   )r   dates     r   ru   z4NKJPCorpus_Header_View.handle_elt.<locals>.<genexpr>   .      AA4TY__..AAAAAAr   zbibl/publisherc              3   H   K   | ]}|j                                         V  d S r	   rq   )r   	publishers     r   ru   z4NKJPCorpus_Header_View.handle_elt.<locals>.<genexpr>   s0      !U!UY)."6"6"8"8!U!U!U!U!U!Ur   z	bibl/idnoc              3   H   K   | ]}|j                                         V  d S r	   rq   )r   idnos     r   ru   z4NKJPCorpus_Header_View.handle_elt.<locals>.<genexpr>   rz   r   z	bibl/notec              3   H   K   | ]}|j                                         V  d S r	   rq   )r   notes     r   ru   z4NKJPCorpus_Header_View.handle_elt.<locals>.<genexpr>   rz   r   )rt   rw   ry   r|   r~   r   )findallr,   )r   eltcontexttitlesrt   authorsrw   datesry   
publishersr|   idnosr~   notesr   s                  r   
handle_eltz!NKJPCorpus_Header_View.handle_elt   s   \** 	FIIEEfEEEEEE++m,, 	JYYIIIIIIIFK(( 	B99AA5AAAAAD[[!122
	 	V		!U!U*!U!U!UUUIK(( 	B99AA5AAAAADK(( 	B99AA5AAAAAD "
 
 	
r   N)r^   r_   r`   r#   rJ   r   r   r   r   r?   r?      sD        L L L	 	 	&
 &
 &
 &
 &
r   r?   c                   $    e Zd ZdZd Zd Zd ZdS )XML_Toola  
    Helper class creating xml file to one without references to nkjp: namespace.
    That's needed because the XMLCorpusView assumes that one can find short substrings
    of XML that are valid XML, which is not true if a namespace is declared at top level
    c                     t           j                            ||          | _        t	          j        d          | _        d S )NF)delete)r*   r+   r,   	read_filetempfileNamedTemporaryFile
write_file)r   r%   rC   s      r   r#   zXML_Tool.__init__   s2    dH55"5UCCCr   c                 8   	 t          | j                  }| j        }d}t          |          r|                                }t          j        d|          }d                    |          }t          j        d|          }d                    |          }t          j        d|          }d                    |          }t          j        d|          }d                    |          }t          j        d|          }d                    |          }|                    |           t          |          |	                                 |	                                 | j        j
        S # t          $ r!}|                                  t          |d }~ww xY w)N znkjp:[^ ]* z<nkjp:paren>z</nkjp:paren>z<choice>z	</choice>)openr   r   rj   readlinerer.   r,   writerl   name	Exceptionremove_preprocessed_file)r   frfwlinexretes          r   build_preprocessed_filez XML_Tool.build_preprocessed_file   s\   	#dn%%BBDd)) {{}}H^T22hhqkkH^S11hhqkkH_c22hhqkkHZ--hhqkkH[#..hhqkk d))  HHJJJHHJJJ?'' 	# 	# 	#))+++"	#s   E+E. .
F8FFc                 B    t          j        | j        j                   d S r	   )r*   remover   r   r2   s    r   r   z!XML_Tool.remove_preprocessed_file  s    
	$/&'''''r   N)r^   r_   r`   __doc__r#   r   r   r   r   r   r   r      sN         D D D# # #2( ( ( ( (r   r   c                   B    e Zd ZdZd Zd Zd Zd Zd Zd Z	d Z
d	 Zd
S )r=   zm
    A stream backed corpus view specialized for use with
    ann_segmentation.xml files in NKJP corpus.
    c                    d| _         t          |t          j                  | _        | j                                         t          |d          | _        t          j        | | j        	                                | j                    d S )Nz.*p/.*srI   zann_segmentation.xml)
rc   rA   r<   	text_viewrJ   r   xml_toolr   r#   r   rd   s      r   r#   z%NKJPCorpus_Segmentation_View.__init__!  s     -/:
 
 
 	##%%% +ABB$-77994<	
 	
 	
 	
 	
r   c                 j    |                     d          d                              d          d         S )N(r   ,r   r5   )r   example_words     r   get_segm_idz(NKJPCorpus_Segmentation_View.get_segm_id/  s.    !!#&&q)//44Q77r   c                 R    t          |                    d          d                   S )Nr   r   )intr.   )r   beg_words     r   get_sent_begz)NKJPCorpus_Segmentation_View.get_sent_beg2  s!    8>>#&&q)***r   c                     |                     d          d                              d          }t          |d                   t          |d                   z   S )N)r   r   r   r   )r.   r   )r   end_wordsplitteds      r   get_sent_endz)NKJPCorpus_Segmentation_View.get_sent_end6  sI    >>#&&q)//448A;#hqk"2"222r   c                     |                      |d                   }| j        j        |         }|                     |d                   }|                     |t          |          dz
                     }|||         S )Nr   r   )r   r   	segm_dictr   r   rj   )r   	sent_segmidrm   begends         r   get_sentencesz*NKJPCorpus_Segmentation_View.get_sentences;  sp    il++~'+	!--	#i..1*< =>>CG}r   c                     g }d}d}|D ]e}|                      |          }|                     |          |dz
  k    s||k    r*|                    |           |                     |          }|}f|S )Nr   )r   r   appendr   )r   rm   r   prev_txt_endprev_txt_nrwordtxt_nrs          r   remove_choicez*NKJPCorpus_Segmentation_View.remove_choiceC  s     	! 	!D%%d++F  &&)999[F=R=R

4   #0066 KK
r   c                    	 |                                   g }	 t          j        | | j                  }t	          |          dk    rnC|D ]?}|                     |          }|                    |                     |                     @q|                                  | j	        
                                 |S # t          $ r&}| j	        
                                 t          |d }~ww xY wrf   )rg   r   rh   ri   rj   r   r   r   rl   r   r   r   )r   	sentencesr   rm   r   s        r   rJ   z)NKJPCorpus_Segmentation_View.handle_queryQ  s    	#JJLLLI?)4T4<HH	y>>Q&&% ? ?D--d33D$$T%7%7%=%=>>>>? JJLLLM22444 	# 	# 	#M22444"	#s   B6B9 9
C)!C$$C)c                 d    g }|D ]*}|                     |                    d                     +|S )Ncorresp)r   get)r   r   r   r   segs        r   r   z'NKJPCorpus_Segmentation_View.handle_eltc  s<     	+ 	+CJJswwy))****
r   N)r^   r_   r`   r   r#   r   r   r   r   r   rJ   r   r   r   r   r=   r=     s         

 
 
8 8 8+ + +3 3 3
    # # #$    r   r=   c                   :    e Zd ZdZdZdZd Zd Zd
dZd Z	d	 Z
dS )rA   za
    A stream backed corpus view specialized for use with
    text.xml files in NKJP corpus.
    r   r   c                     |                     dd          | _        d| _        t                      | _        t          |d          | _        t          j        | | j        	                                | j                   d S )Nr7   r   z	.*/div/abztext.xml)
r9   r7   rc   dictr   r   r   r   r#   r   rd   s      r   r#   zNKJPCorpus_Text_View.__init__s  sp    JJvq))	" :66$-77994<	
 	
 	
 	
 	
r   c                 $   	 |                                   |                     | j                  }|                                  | j                                         |S # t          $ r&}| j                                         t          |d }~ww xY wr	   )rg   rh   ri   rl   r   r   r   )r   r   r   s      r   rJ   z!NKJPCorpus_Text_View.handle_query~  s    	#JJLLL--AJJLLLM22444H 	# 	# 	#M22444"	#s   AA 
B)!B

BNc                     g }	 t          j        | |          }t          |          dk    rn|D ]}|                    |           Dd                    d |D                       gS )z6
        Returns text as a list of sentences.
        Tr   r   c                     g | ]}|S r   r   )r   rm   s     r   r    z3NKJPCorpus_Text_View.read_block.<locals>.<listcomp>  s    ///4$///r   )r   rh   rj   r   r,   )r   streamrc   elt_handlertxtrm   parts          r   rh   zNKJPCorpus_Text_View.read_block  s     	! +D&99D4yyA~~ ! !

4    	! //3///0011r   c                 r    |j         D ].}|                    d          r|                    |          c S /d S )Nr   )attribendswithr   )r   r   attrs      r   r   z NKJPCorpus_Text_View.get_segm_id  sL    J 	% 	%D}}T"" %wwt}}$$$%	% 	%r   c                 z    | j         t          j        u r"|j        | j        |                     |          <   |j        S r	   )r7   rA   r<   rr   r   r   )r   r   r   s      r   r   zNKJPCorpus_Text_View.handle_elt  s6    9,77747HDN4++C001xr   )NN)r^   r_   r`   r   r<   r@   r#   rJ   rh   r   r   r   r   r   rA   rA   j  sz         
 JH	
 	
 	
	# 	# 	#2 2 2 2% % %
    r   rA   c                   $    e Zd ZdZd Zd Zd ZdS )r;   zm
    A stream backed corpus view specialized for use with
    ann_morphosyntax.xml files in NKJP corpus.
    c                     |                     dd           | _        d| _        t          |d          | _        t          j        | | j                                        | j                   d S )Nr8   z	.*/seg/fszann_morphosyntax.xml)r9   r8   rc   r   r   r   r#   r   rd   s      r   r#   zNKJPCorpus_Morph_View.__init__  sf    JJvt,,	" +ABB$-77994<	
 	
 	
 	
 	
r   c                    	 |                                   g }	 t          j        | | j                  }t	          |          dk    rn|D ]}||                    |           K|                                  | j                                         |S # t          $ r&}| j                                         t          |d }~ww xY wrf   )
rg   r   rh   ri   rj   r   rl   r   r   r   )r   rW   rm   r   r   s        r   rJ   z"NKJPCorpus_Morph_View.handle_query  s    	#JJLLLE+$/dlCCt99>>  + +D'T***+ JJLLLM22444L 	# 	# 	#M22444"	#s   BB 
C!B>>Cc                    d}d}d}| j         d}|D ]#}d|                                v r)|j        d         dk    r|D ]}|j        dk    r|j        }Bd|                                v r|j        d         dk    r|D ]}d|                                v r|j        d         d	k    r|D ]}d|                                v rs|j        d         d
k    rb|D ]_}	d|	                                v r| j         |	j        d         | j         v rd}6d|	                                v r|	j        d         dk    rd}`%|r|r|S d S d S )N FTr   orthstringinterpstypelexctagvalueinterp)r8   keysr   tagrr   )
r   r   r   r   flagis_not_interpchildsymbolsymbol2symbol3s
             r   r   z NKJPCorpus_Morph_View.handle_elt  s   9D 	> 	>E %%%,v*>&*H*H# + +FzX--%{+ 5::<<''EL,@I,M,M# > >F..6=3HE3Q3Q'- > >G &',,.. 8 8$+N6$:f$D$D/6 !> !>G(/7<<>>(A(A,0I,A,3N7,Cty,P,P/3(/7<<>>(A(A,3N7,Cx,O,O8= 	M 	K	 	 	 	r   N)r^   r_   r`   r   r#   rJ   r   r   r   r   r;   r;     sK         

 
 
# # #$$ $ $ $ $r   r;   )r   r*   r   r   nltk.corpus.reader.utilr   nltk.corpus.reader.xmldocsr   r   r   r   r?   r   r=   rA   r;   r   r   r   <module>r      s       				 				  * * * * * * E E E E E E E E  R
 R
 R
 R
 R
 R
 R
 R
j;
 ;
 ;
 ;
 ;
] ;
 ;
 ;
|%( %( %( %( %( %( %( %(PL L L L L= L L L^6 6 6 6 6= 6 6 6rD D D D DM D D D D Dr   