
    '[fW                         d Z ddlZddlT ddlT ddlmZ ddlmZ ddlm	Z	m
Z
  G d de          Z G d	 d
          Z G d de          Z G d de          ZdS )z!
Read CoNLL-style chunk fileids.
    N)*)map_tag)Tree)LazyConcatenationLazyMapc                      e Zd ZdZdZdZdZdZdZdZ	dZ
eeeeee	e
fZd	d
ddded	d	fdZd%dZd%dZd&dZd&dZd'dZd'dZd'dZd%dZd(dZd&dZd&dZd%dZd Zd Zd%dZd%dZd%dZd%d Zd! Z d" Z!d# Z"e#d$             Z$d	S ))ConllCorpusReadera  
    A corpus reader for CoNLL-style files.  These files consist of a
    series of sentences, separated by blank lines.  Each sentence is
    encoded using a table (or "grid") of values, where each line
    corresponds to a single word, and each column corresponds to an
    annotation type.  The set of columns used by CoNLL-style files can
    vary from corpus to corpus; the ``ConllCorpusReader`` constructor
    therefore takes an argument, ``columntypes``, which is used to
    specify the columns that are used by a given corpus. By default
    columns are split by consecutive whitespaces, with the
    ``separator`` argument you can set a string to split by (e.g.
    ``'	'``).


    @todo: Add support for reading from corpora where different
        parallel files contain different columns.
    @todo: Possibly add caching of the grid corpus view?  This would
        allow the same grid view to be used by different data access
        methods (eg words() and parsed_sents() could both share the
        same grid corpus view object).
    @todo: Better support for -DOCSTART-.  Currently, we just ignore
        it, but it could be used to define methods that retrieve a
        document at a time (eg parsed_documents()).
    wordspostreechunknesrlignoreNSFTutf8c                 N   |D ]}|| j         vrt          d|z            t          |t                    r|g}|| _        d t          |          D             | _        || _        || _        || _	        |	| _
        t                              | |||           |
| _        || _        d S )NzBad column type %rc                     i | ]\  }}||	S  r   ).0ics      L/var/www/piapp/venv/lib/python3.11/site-packages/nltk/corpus/reader/conll.py
<dictcomp>z.ConllCorpusReader.__init__.<locals>.<dictcomp>W   s    BBB!Q1BBB    )COLUMN_TYPES
ValueError
isinstancestr_chunk_types	enumerate_colmap_pos_in_tree_root_label_srl_includes_roleset_tree_classCorpusReader__init___tagsetsep)selfrootfileidscolumntypeschunk_types
root_labelpos_in_treesrl_includes_rolesetencoding
tree_classtagset	separator
columntypes                r   r(   zConllCorpusReader.__init__C   s     & 	D 	DJ!222 !5
!BCCC 3k3'' 	(&-K'BB9[+A+ABBB'%%9"%dD'8<<<r   c                     |                      | j                   t          t          | j        |                     |                              S N)_requireWORDSr   r   
_get_words_gridsr+   r-   s     r   r
   zConllCorpusReader.wordsd   s<    dj!!! $++g:N:N!O!OPPPr   c                     |                      | j                   t          | j        |                     |                    S r9   )r:   r;   r   r<   r=   r>   s     r   sentszConllCorpusReader.sentsh   s4    dj!!!tG(<(<===r   c                                             j         j                    fd}t          t	          |                     |                              S )Nc                 0                         |           S r9   _get_tagged_wordsgridr+   r5   s    r   get_tagged_wordsz8ConllCorpusReader.tagged_words.<locals>.get_tagged_wordso       ))$777r   )r:   r;   POSr   r   r=   r+   r-   r5   rG   s   ` ` r   tagged_wordszConllCorpusReader.tagged_wordsl   sa    dj$(+++	8 	8 	8 	8 	8 	8 !)94;;w;O;O!P!PQQQr   c                                             j         j                    fd}t          |                     |                    S )Nc                 0                         |           S r9   rC   rE   s    r   rG   z8ConllCorpusReader.tagged_sents.<locals>.get_tagged_wordsw   rH   r   )r:   r;   rI   r   r=   rJ   s   ` ` r   tagged_sentszConllCorpusReader.tagged_sentst   sY    dj$(+++	8 	8 	8 	8 	8 	8 'W)=)=>>>r   c                                             j         j         j                    j         fd}t          t          |                     |                              S )Nc                 2                         |           S r9   _get_chunked_wordsrF   r/   r+   r5   s    r   get_chunked_wordsz:ConllCorpusReader.chunked_words.<locals>.get_chunked_words       **4fEEEr   )r:   r;   rI   CHUNKr    r   r   r=   r+   r-   r/   r5   rT   s   ` `` r   chunked_wordszConllCorpusReader.chunked_words|   s    dj$(DJ777+K	F 	F 	F 	F 	F 	F 	F !):DKK<P<P!Q!QRRRr   c                                             j         j         j                    j         fd}t          |                     |                    S )Nc                 2                         |           S r9   rQ   rS   s    r   rT   z:ConllCorpusReader.chunked_sents.<locals>.get_chunked_words   rU   r   )r:   r;   rI   rV   r    r   r=   rW   s   ` `` r   chunked_sentszConllCorpusReader.chunked_sents   sx    dj$(DJ777+K	F 	F 	F 	F 	F 	F 	F ($++g*>*>???r   c                                             j         j         j                    j         fd}t          |                     |                    S )Nc                 2                         |           S r9   )_get_parsed_sent)rF   r1   r+   r5   s    r   get_parsed_sentz7ConllCorpusReader.parsed_sents.<locals>.get_parsed_sent   s    (({FCCCr   )r:   r;   rI   TREEr#   r   r=   )r+   r-   r1   r5   r_   s   ` `` r   parsed_sentszConllCorpusReader.parsed_sents   sw    dj$(DI666+K	D 	D 	D 	D 	D 	D 	D G(<(<===r   c                     |                      | j                   t          | j        |                     |                    S r9   )r:   SRLr   _get_srl_spansr=   r>   s     r   	srl_spanszConllCorpusReader.srl_spans   s5    dht*DKK,@,@AAAr   c                                             j         j         j         j                    j         fd}t          |                     |                    }|rt          |          }|S )Nc                 0                         |           S r9   )_get_srl_instances)rF   r1   r+   s    r   get_srl_instancesz:ConllCorpusReader.srl_instances.<locals>.get_srl_instances   s    **4===r   )	r:   r;   rI   r`   rc   r#   r   r=   r   )r+   r-   r1   flattenri   results   ` `   r   srl_instanceszConllCorpusReader.srl_instances   s    dj$(DItx@@@+K	> 	> 	> 	> 	> 	> *DKK,@,@AA 	/&v..Fr   c                                             j         j         j                    fd}t	          t          |                     |                              S )z
        :return: a list of word/tag/IOB tuples
        :rtype: list(tuple)
        :param fileids: the list of fileids that make up this corpus
        :type fileids: None or str or list
        c                 0                         |           S r9   _get_iob_wordsrE   s    r   get_iob_wordsz2ConllCorpusReader.iob_words.<locals>.get_iob_words       &&tV444r   )r:   r;   rI   rV   r   r   r=   r+   r-   r5   rq   s   ` ` r   	iob_wordszConllCorpusReader.iob_words   sf     	dj$(DJ777	5 	5 	5 	5 	5 	5 !G8L8L!M!MNNNr   c                                             j         j         j                    fd}t	          |                     |                    S )z
        :return: a list of lists of word/tag/IOB tuples
        :rtype: list(list)
        :param fileids: the list of fileids that make up this corpus
        :type fileids: None or str or list
        c                 0                         |           S r9   ro   rE   s    r   rq   z2ConllCorpusReader.iob_sents.<locals>.get_iob_words   rr   r   )r:   r;   rI   rV   r   r=   rs   s   ` ` r   	iob_sentszConllCorpusReader.iob_sents   s^     	dj$(DJ777	5 	5 	5 	5 	5 	5 }dkk'&:&:;;;r   c                 b     t           fd                     |d          D                       S )Nc                 D    g | ]\  }}t          |j        |           S ))r3   )StreamBackedCorpusView_read_grid_block)r   fileidencr+   s      r   
<listcomp>z,ConllCorpusReader._grids.<locals>.<listcomp>   s?       !VS 'vt/DsSSS  r   T)concatabspathsr>   s   ` r   r=   zConllCorpusReader._grids   sK        %)]]7D%A%A  
 
 	
r   c                     g }t          |          D ]}|                                }|s fd|                    d          D             }|d          j                            dd                   dk    r|d= |D ]:}t          |          t          |d                   k    rt          d|z            ;|                    |           |S )Nc                 D    g | ]}|                     j                  S r   )splitr*   )r   liner+   s     r   r~   z6ConllCorpusReader._read_grid_block.<locals>.<listcomp>   s'    GGGTDJJtx((GGGr   
r   r
   z
-DOCSTART-z"Inconsistent number of columns:
%s)read_blankline_blockstripr   r"   getlenr   append)r+   streamgridsblockrF   rows   `     r   r{   z"ConllCorpusReader._read_grid_block   s    )&11 	 	EKKMME GGGGU[[5F5FGGGD Awt|''334DDG  T Ts88s47||++$%JU%RSSS ,LLr   c                 D    |                      || j        d                   S )Nr
   )_get_columnr"   )r+   rF   s     r   r<   zConllCorpusReader._get_words   s    dl7&;<<<r   c           	                                 | j        d                   }r j        k    r fd|D             }t          t	                                | j        d                   |                    S )Nr   c                 <    g | ]}t          j        |          S r   r   r)   r   tr+   r5   s     r   r~   z7ConllCorpusReader._get_tagged_words.<locals>.<listcomp>   '    KKKQfa88KKKr   r
   r   r"   r)   listzipr+   rF   r5   pos_tagss   ` ` r   rD   z#ConllCorpusReader._get_tagged_words   s    ##D$,u*=>> 	Lf,,KKKKK(KKKHC((t|G/DEExPPQQQr   c                 8                          | j        d                   }r j        k    r fd|D             }t          t	                                | j        d                   |                      | j        d                                       S )Nr   c                 <    g | ]}t          j        |          S r   r   r   s     r   r~   z4ConllCorpusReader._get_iob_words.<locals>.<listcomp>   r   r   r
   r   r   r   s   ` ` r   rp   z ConllCorpusReader._get_iob_words   s    ##D$,u*=>> 	Lf,,KKKKK(KKKH  t|G'<==  t|G'<== 
 
 	
r   c                 0                          | j        d                   }                      | j        d                   }r j        k    r fd|D             }                      | j        d                   }t           j        g           g}t          |||          D ]\  }}	}
|
dk    rd\  }}n|
                    d          \  }}|||vrd}|dk    r ||d	                                         k    rd
}|dv r't          |          dk    r|	                                 |d
k    r@t          |g           }|d	         
                    |           |
                    |           |d	         
                    ||	f           |d         S )Nr
   r   c                 <    g | ]}t          j        |          S r   r   r   s     r   r~   z8ConllCorpusReader._get_chunked_words.<locals>.<listcomp>  r   r   r   O)r    -IBBO   r   )r   r"   r)   r   r$   r   r   labelr   popr   )r+   rF   r/   r5   r
   r   
chunk_tagsstackwordpos_tag	chunk_tagstate
chunk_type	new_chunks   `  `          r   rR   z$ConllCorpusReader._get_chunked_words  s     t|G'<==##D$,u*=>> 	Lf,,KKKKK(KKKH%%dDL,ABB
d&++,*-eXz*J*J 	. 	.&T7IC$+!zz&/ooc&:&:#
&:[+H+H||
eBioo.?.? ? ?}}Uq		|| R00	b	  +++Y'''"IdG_----Qxr   c           	                                | j        d                   }                      | j        d                   }r j        k    r fd|D             }                      | j        d                   }d}t          |||          D ]f\  }}	}
|dk    rd}|dk    rd	}|	dk    rd}	|	dk    rd	}	|
                    d
          \  }}|                    d          dz  }|| d|	 d| d| z  }g	  j                            |          }n=# t          t          f$ r)  j                            d j
         d| d          }Y nw xY w|s|                                D ]y}t          |          D ]g\  }}t          |t                    rMt          |          dk    r:t          |d         t                     r|d         |                                f||<   hz|S )Nr
   r   c                 <    g | ]}t          j        |          S r   r   r   s     r   r~   z6ConllCorpusReader._get_parsed_sent.<locals>.<listcomp>,  r   r   r   r   (z-LRB-)z-RRB-r   z ( z)    r   )r   r"   r)   r   r   countr&   
fromstringr   
IndexErrorr$   subtreesr!   r   r   r   r   r   )r+   rF   r1   r5   r
   r   
parse_tagstreestrr   r   	parse_tagleftrightr   subtreer   childs   `  `             r   r^   z"ConllCorpusReader._get_parsed_sent(  sL     t|G'<==##D$,u*=>> 	Lf,,KKKKK(KKKH%%dDL,@AA
*-eXz*J*J 	< 	<&T7Is{{s{{#~~!#~~!%OOC00MT5KK$$s*E$;;';;D;;E;;;GG	R#..w77DDJ' 	R 	R 	R#../P43C/P/Pg/P/P/PQQDDD	R  	?==?? ? ? )' 2 2 ? ?HAu"5$//?JJ!OO&uQx55 , ',Ah%>
? s   =D 7EEc                 
   | j         r5|                     || j        d         dz             }| j        d         dz   }n1|                     || j        d                   }| j        d         dz   }t          d |D                       }g }t	          |          D ]}|                     |||z             }g }g }	t          |          D ]\  }
}|                    d          \  }}|                    d          D ]}|r|	                    ||
f           t	          |                    d                    D ]5}|		                                \  }}|                    ||
dz   f|f           6|                    |           |S )z;
        list of list of (start, end), tag) tuples
        r   r   r   c                     g | ]
}|d k    |S )r   r   )r   ps     r   r~   z4ConllCorpusReader._get_srl_spans.<locals>.<listcomp>Z  s    ;;;q!s(((((r   r   r   r   )
r%   r   r"   r   ranger!   r   r   r   r   )r+   rF   
predicates	start_col	num_preds	spanlistsr   colspanlistr   wordnumsrl_tagr   r   tagstarts                   r   rd   z ConllCorpusReader._get_srl_spansM  s    % 	0))$U0Ca0GHHJU+a/II))$U0CDDJU+a/I ;;J;;;<<		y!! 	' 	'A""4Q77CHE$-cNN A A  'c 2 2u::c?? 5 5C 5c7^444u{{3//00 A AA#(99;;LS%OOeWq[%93$?@@@@A X&&&&r   c           
         |                      ||          }|                     |          }| j        rF|                     || j        d         dz             }|                     || j        d                   }n4|                     || j        d                   }d gt          |          z  }t          |          }t          |          D ]t\  }}	|	dk    r|D ]'}
|
D ] \  \  }}}|t          ||          v r|dv r n!& nt          d|	z            |
                    t          |||	||         |
                     u|S )Nr   r   r   VzC-VzNo srl column found for %r)r^   rd   r%   r   r"   r   ConllSRLInstanceListr!   r   r   r   ConllSRLInstance)r+   rF   r1   r   r   r   rolesets	instancesr   	predicater   r   endr   s                 r   rh   z$ConllCorpusReader._get_srl_instancesm  s   $$T;77''--	% 	0))$U0Ca0GHHJ''dl5.ABBHH))$U0CDDJvJ/H(..	"+J"7"7 	 	GYC & K K)1  %LUC#%s"3"333|8K8K !=	!IJJJ w	8G;LhWW    r   c                 F    |D ]}|| j         vrt          d|z            d S )Nz)This corpus does not contain a %s column.)r"   r   )r+   r.   r7   s      r   r:   zConllCorpusReader._require  sG    % 	 	J-- BZO   .	 	r   c                 X      fdt          t                               D             S )Nc                 ,    g | ]}|                  S r   r   )r   r   column_indexrF   s     r   r~   z1ConllCorpusReader._get_column.<locals>.<listcomp>  s"    @@@!Q%@@@r   )r   r   )rF   r   s   ``r   r   zConllCorpusReader._get_column  s/    @@@@@uSYY/?/?@@@@r   r9   )NN)NNN)NNT)%__name__
__module____qualname____doc__r;   rI   r`   rV   NErc   IGNOREr   r   r(   r
   r@   rK   rN   rX   r[   ra   re   rl   rt   rw   r=   r{   r<   rD   rp   rR   r^   rd   rh   r:   staticmethodr   r   r   r   r	   r	      sS        : E
CDE	B
CF 3eRf=L !   BQ Q Q Q> > > >R R R R? ? ? ?S S S S@ @ @ @> > > >B B B B   O O O O< < < <$	
 	
 	
 	
  6= = =R R R R

 

 

 

       D# # # #J  @  H   A A \A A Ar   r	   c                   $    e Zd ZdZd Zd Zd ZdS )r   z|
    An SRL instance from a CoNLL corpus, which identifies and
    providing labels for the arguments of a single verb.
    c           	      X   g | _         	 || _        	 || _        || _        g | _        	 || _        	 || _        	 |                                | _        	 |D ]V\  \  }}}|dv r,| xj         t          t          ||                    z  c_         8| j                            ||f|f           Wd S )Nr   )verb	verb_head	verb_stemroleset	argumentstagged_spansr   leavesr
   r   r   r   )	r+   r   r   r   r   r   r   r   r   s	            r   r(   zConllSRLInstance.__init__  s    		/
 #	"
 #	F
 )	* 	G[[]]
	 ". 	; 	;LUC#l""		T%s"3"3444			%%s|S&9::::		; 	;r   c                 x    t          | j                  dk    rdnd}d| j        t          | j                  |fz  S )Nr   sr   z,<ConllSRLInstance for %r with %d argument%s>)r   r   r   )r+   plurals     r   __repr__zConllSRLInstance.__repr__  sD     DN++q00b=^S00&9
 	
r   c                     d                      fd j        D                       }d|d j        d}d}t           j                  D ]a\  }}t          |t                    r|d         } j        D ]!\  \  }}}||k    r|d|z  z  }||k    r|d	z  }"| j        v rd
|z  }||dz   z  }b|t          j	        |
                    dd          dd          z   S )Nr   c              3   >   K   | ]}j         |         d          V  dS )r   N)r
   )r   r   r+   s     r   	<genexpr>z*ConllSRLInstance.pprint.<locals>.<genexpr>  s.      ??4:a=+??????r   zSRL for z (stem=z):
r   r   z[%s z] z<<%s>>z ]]z    )initial_indentsubsequent_indent)joinr   r   r!   r
   r   tupler   textwrapfillreplace)	r+   verbstrhdrr   r   r   r   r   argids	   `        r   pprintzConllSRLInstance.pprint  s   ((????TY?????AAA4>AAA ,, 
	 
	GAt$&& Aw'+~  #e::%'A88IADI~~$OAAX]IIdC  6
 
 
 
 	
r   N)r   r   r   r   r(   r   r   r   r   r   r   r     sL         (; (; (;T
 
 

 
 
 
 
r   r   c                   .    e Zd ZdZd	dZd Zd
dZd ZdS )r   z0
    Set of instances for a single sentence
    r   c                 J    || _         t                              | |           d S r9   )r   r   r(   )r+   r   r   s      r   r(   zConllSRLInstanceList.__init__  s#    	dI&&&&&r   c                 *    |                                  S r9   )r   )r+   s    r   __str__zConllSRLInstanceList.__str__  s    {{}}r   Fc                    | D ]!}|j         | j         k    rt          d          "|r]| j                                         }d gt          |          z  }dgt          |          z  }|                     | j         d|||           d}t          t          |                    D ]}|rJ|d||         z  z  }|d||         z  z  }|dt          ||                             d                    z  z  }| D ]}||j        k    r|d|j	        z  z  } n	|ddz  z  }| D ]7}d}|j
        D ]#\  \  }	}
}||	k    rd	| | }||
d
z
  k    r|dz  }$|d|z  z  }8|dz  }|S )NzTree mismatch!r   r   r   z%-20s z%-8s z
%15s*%-8s r   r   r   r   z%-12s r   )r   r   r   r   _tree2conllr   r   r   r   r   r   )r+   include_treeinstr
   r   syntr   r   argstrr   r   r   s               r   r   zConllSRLInstanceList.pprint  s    	3 	3DyDI%% !1222 &  	=I$$&&E&3u::%C53u::%DTY5#t<<<s5zz"" 	 	A >Xa((Ws1v%%\E$q'--*<*<$=$===  $ $&&DN22AE ' X^# ' '+/+< & &'LUC%Ezz!4U!4F!4!4S1W~~#X&&IAAr   c                 T   t          |t                    sJ t          |          dk    rKt          |d         t                    r0|                                ||<   ||         |d         k    sJ |dz   S t          |          dk    rLt          |d         t
                    r1t          |d                   dk    sJ |d         \  ||<   ||<   |dz   S d|                                 ||          ||<   |D ]}|                     |||||          }||dz
  xx         dz  cc<   |S )Nr   r   r   r   r   )r   r   r   r   r   r   r  )r+   r   r   r
   r   r	  r   s          r   r  z ConllSRLInstanceList._tree2conll  s>   $%%%%%t99>>ja#66>::<<CL>T!W,,,,Q;YY!^^
47E : :^tAw<<1$$$$)-a&CL#g,Q;=

=d7m==DM M M**5'5#tLL1$Nr   N)r   )F)r   r   r   r   r(   r  r   r  r   r   r   r   r     se         ' ' ' '  & & & &P    r   r   c                       e Zd ZdZ	 ddZdS )ConllChunkCorpusReaderz`
    A ConllCorpusReader whose data file contains three columns: words,
    pos, and chunk.
    r   Nc           
      J    t                               | ||d||||           d S )N)r
   r   r   )r/   r3   r5   r6   )r	   r(   )r+   r,   r-   r/   r3   r5   r6   s          r   r(   zConllChunkCorpusReader.__init__7  sC     	""%# 	# 		
 		
 		
 		
 		
r   )r   NN)r   r   r   r   r(   r   r   r   r  r  1  s9          SW
 
 
 
 
 
r   r  )r   r   nltk.corpus.reader.apinltk.corpus.reader.utilnltk.tagr   	nltk.treer   	nltk.utilr   r   r'   r	   r   r   r   r  r   r   r   <module>r     sB     $ $ $ $ % % % %             0 0 0 0 0 0 0 0DA DA DA DA DA DA DA DANK
 K
 K
 K
 K
 K
 K
 K
\C C C C C4 C C CL
 
 
 
 
. 
 
 
 
 
r   