
    '[f                         d dl Z d dlmZmZ d dlmZmZmZ d dlm	Z	 d Z
 G d de          Zd Zd	 Zed
k    r e             dS dS )    N)CorpusReaderSyntaxCorpusReader)FileSystemPathPointerfind_corpus_fileidsread_blankline_block)DependencyGraphc                 @    d                     d | D                       S )N/c              3   >   K   | ]}|d          dk    |d          V  dS )r   EOSN .0ms     K/var/www/piapp/venv/lib/python3.11/site-packages/nltk/corpus/reader/knbc.py	<genexpr>z<lambda>.<locals>.<genexpr>   s.      -T-Tqademmadmmmm-T-T    joinmorphss    r   <lambda>r      s!    SXX-T-TF-T-T-T%T%T r   c                   8    e Zd ZdZdefdZd Zd Zd	dZd Z	dS )
KNBCorpusReadera  
    This class implements:
      - ``__init__``, which specifies the location of the corpus
        and a method for detecting the sentence blocks in corpus files.
      - ``_read_block``, which reads a block from the input stream.
      - ``_word``, which takes a block and returns a list of list of words.
      - ``_tag``, which takes a block and returns a list of list of tagged
        words.
      - ``_parse``, which takes a block and returns a list of parsed
        sentences.

    The structure of tagged words:
      tagged_word = (word(str), tags(tuple))
      tags = (surface, reading, lemma, pos1, posid1, pos2, posid2, pos3, posid3, others ...)

    Usage example

    >>> from nltk.corpus.util import LazyCorpusLoader
    >>> knbc = LazyCorpusLoader(
    ...     'knbc/corpus1',
    ...     KNBCorpusReader,
    ...     r'.*/KN.*',
    ...     encoding='euc-jp',
    ... )

    >>> len(knbc.sents()[0])
    9

    utf8c                 B    t          j        | |||           || _        dS )z
        Initialize KNBCorpusReader
        morphs2str is a function to convert morphlist to str for tree representation
        for _parse()
        N)r   __init__
morphs2str)selfrootfileidsencodingr   s        r   r   zKNBCorpusReader.__init__7   s&     	#D$BBB$r   c                      t          |          S N)r   )r   streams     r   _read_blockzKNBCorpusReader._read_block@   s    #F+++r   c                     g }|                                 D ]Y}t          j        d|          sB|                                                    d          }|                    |d                    Z|S )NEOS|\*|\#|\+ r   )
splitlinesrematchstripsplitappend)r   treslinecellss        r   _wordzKNBCorpusReader._wordD   si    LLNN 	% 	%D8OT22 %

**3//

58$$$
r   Nc           	          g }|                                 D ]v}t          j        d|          s_|                                                    d          }|                    |d         d                    |dd                    f           w|S )Nr(   r)   r      )r*   r+   r,   r-   r.   r/   r   )r   r0   tagsetr1   r2   r3   s         r   _tagzKNBCorpusReader._tagO   s    LLNN 	< 	<D8OT22 <

**3//

E!HchhuQRRy&9&9:;;;
r   c                    t                      }d}|                                D ]j}|d         dv r|                                                    dd          }t	          j        d|d                   }|J |j        |         }|                    ||                    d          g d           t          |                    d                    }|d	k    r||_
        n&|j        |         d
                             |           |dz  }|d         dk    ru|                                                    d          }|d         d                    |dd                    f}	|j        |dz
           d                             |	           l| j        r:|j                                        D ] }|                     |d                   |d<   !|                                S )Nr   z*+r)      z([\-0-9]*)([ADIP])r6      )addressrelworddeps#r>   )r   r*   r-   r.   r+   r,   nodesupdategroupintr    r/   r   r   valuestree)
r   r0   dgir2   r3   r   node
dep_parentmorphs
             r   _parsezKNBCorpusReader._parseZ   s   LLNN 	6 	6DAw$ 

**322H2E!H==}}}x{!''!**bIIJJJ __
##"BGGHZ(077:::QaC

**3//a#((59"5"55Q'..u555? 	=)) = =#tF|<<Vwwyyr   r$   )
__name__
__module____qualname____doc___morphs2str_defaultr   r&   r4   r8   rM   r   r   r   r   r      s{         < 06BU % % % %, , ,  	 	 	 	" " " " "r   r   c                  f   dd l } ddlm} | j                            d          }d t          t          |          d          D             }d } |dt          t          ||          d	
          }t          |
                                d d                    t          d                    |                                d d                              t          d                    d |                                d d         D                                  d |_        t          d                    d |                                d d         D                                  t          d                    d |                                dd         D                                  d S )Nr   LazyCorpusLoaderzcorpora/knbc/corpus1c                 <    g | ]}t          j        d |          |S )z\d\-\d\-[\d]+\-[\d]+)r+   search)r   fs     r   
<listcomp>zdemo.<locals>.<listcomp>   s;       9,a00	  r   z.*c                     |                      d          }|d         t          |d                   t          |d                   t          |d                   fS )N-r   r6   r;   r:   )r.   rE   )xr3   s     r   _knbc_fileids_sortz demo.<locals>._knbc_fileids_sort   sB    a#eAh--U1XE!HFFr   knbc/corpus1)keyeuc-jpr"   
    d   z

c              3   4   K   | ]}t          |          V  d S r$   )strr   rG   s     r   r   zdemo.<locals>.<genexpr>   s(      DDDc$iiDDDDDDr   r;   c                 f    d                     d | D                                           d          S )Nr
   c              3      K   | ]K}|d          dk    d                     |d          |d                             d          d                   V  LdS )r   r   z{}({})r6   r)   r;   Nformatr.   r   s     r   r   z)demo.<locals>.<lambda>.<locals>.<genexpr>   sX       . .67QqTU]]!adjjooa011]]]]. .r   zutf-8)r   encoder   s    r   r   zdemo.<locals>.<lambda>   s>    SXX . .;A. . . & &fWoo r   c              3       K   | ]	}d |z  V  
dS )z%sNr   rg   s     r   r   zdemo.<locals>.<genexpr>   s&      FFddTkFFFFFFr   
c              3   T   K   | ]#}d                      d |D                       V  $dS )r)   c              3      K   | ]?}d                      |d         |d                             d          d                   V  @dS )z{}/{}r   r6   r)   r;   Nrj   )r   ws     r   r   z!demo.<locals>.<genexpr>.<genexpr>   sG      LL!W^^AaD!A$**S//!*<==LLLLLLr   Nr   )r   sents     r   r   zdemo.<locals>.<genexpr>   sQ       
 
 HHLLtLLLLL
 
 
 
 
 
r   )nltknltk.corpus.utilrU   datafindr   r   r   sortedprintr!   r   wordsparsed_sentsr   tagged_sents)rs   rU   r    r!   r]   knbcs         r   demor}      s   KKK1111119>>011D $%:4%@%@$GG  GG G G w.///	  D 
$,,.."
	"''$**,,tt$
%
%&&&	&++DDD,=,=,?,?,CDDD
D
DEEE DO 
&++FFd.?.?.A.A"1".EFFF
F
FGGG			 
 
))++AaC0
 
 
 	
 	
    r   c                     ddl m}   | dt          dd          }t          |                                d         t
                    sJ t          |                                d         d         t
                    sJ t          |                                d         t                    sJ t          |	                                d         d         t                    sJ d S )Nr   rT   r^   z.*/KN.*r`   ra   )
rt   rU   r   
isinstancery   rf   sentstagged_wordstupler{   )rU   r|   s     r   testr      s    111111h  D djjll1os+++++djjll1oa(#.....d''))!,e44444d''))!,Q/7777777r   __main__)r+   nltk.corpus.reader.apir   r   nltk.corpus.reader.utilr   r   r   
nltk.parser   rR   r   r}   r   rN   r   r   r   <module>r      s    
			 C C C C C C C C         
 ' & & & & & UT d d d d d( d d dX' ' 'T
8 
8 
8 zDFFFFF r   