
    '[f$                         d Z ddlZddlZddlZddlmZ ddlT ddlm	Z	 ddl
T ddlT ddlmZ  G d de          Z G d	 d
e          ZdS )zN
A reader for corpora that contain chunked (and optionally tagged)
documents.
    Ntagstr2tree)*)BracketParseCorpusReader)Treec                       e Zd ZdZde edd          eddfdZdd	Zdd
Z	ddZ
ddZddZddZddZddZddZd ZdS )ChunkedCorpusReadera&  
    Reader for chunked (and optionally tagged) corpora.  Paragraphs
    are split using a block reader.  They are then tokenized into
    sentences using a sentence tokenizer.  Finally, these sentences
    are parsed into chunk trees using a string-to-chunktree conversion
    function.  Each of these steps can be performed using a default
    function or a custom function.  By default, paragraphs are split
    on blank lines; sentences are listed one per line; and sentences
    are parsed into chunk trees using ``nltk.chunk.tagstr2tree``.
     
T)gapsutf8Nc	                 V    t                               | |||           ||||f| _        dS )z
        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        N)CorpusReader__init___cv_args)	selfrootfileids	extensionstr2chunktreesent_tokenizerpara_block_readerencodingtagsets	            N/var/www/piapp/venv/lib/python3.11/site-packages/nltk/corpus/reader/chunked.pyr   zChunkedCorpusReader.__init__&   s;     	dD'8<<<&8I6R	A 	A    c                 b     t           fd                     |d          D                       S )z~
        :return: the given file(s) as a list of words
            and punctuation symbols.
        :rtype: list(str)
        c           
      D    g | ]\  }}t          ||d d d d gj        R  S )r   ChunkedCorpusViewr   .0fencr   s      r   
<listcomp>z-ChunkedCorpusReader.words.<locals>.<listcomp>A   I       Q "!S!Q1Et}EEE  r   Tconcatabspathsr   r   s   ` r   wordszChunkedCorpusReader.words:   sK         $gt < <  
 
 	
r   c                 b     t           fd                     |d          D                       S )z
        :return: the given file(s) as a list of
            sentences or utterances, each encoded as a list of word
            strings.
        :rtype: list(list(str))
        c           
      D    g | ]\  }}t          ||d dd d gj        R  S r      r   r!   s      r   r%   z-ChunkedCorpusReader.sents.<locals>.<listcomp>O   r&   r   Tr'   r*   s   ` r   sentszChunkedCorpusReader.sentsG   K         $gt < <  
 
 	
r   c                 b     t           fd                     |d          D                       S )z
        :return: the given file(s) as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as lists of word strings.
        :rtype: list(list(list(str)))
        c           
      D    g | ]\  }}t          ||d ddd gj        R  S r.   r   r!   s      r   r%   z-ChunkedCorpusReader.paras.<locals>.<listcomp>]   r&   r   Tr'   r*   s   ` r   paraszChunkedCorpusReader.parasU   r1   r   c                 f     t           fd                     |d          D                       S )z
        :return: the given file(s) as a list of tagged
            words and punctuation symbols, encoded as tuples
            ``(word,tag)``.
        :rtype: list(tuple(str,str))
        c           
      J    g | ]\  }}t          ||d dddgj        R di S r/   r   target_tagsetr   r"   r#   r$   r   r   s      r   r%   z4ChunkedCorpusReader.tagged_words.<locals>.<listcomp>k   b        Q "sAq!Q)-  FL   r   Tr'   r   r   r   s   ` `r   tagged_wordsz ChunkedCorpusReader.tagged_wordsc   S          !%gt < <	  
 
 	
r   c                 f     t           fd                     |d          D                       S )z
        :return: the given file(s) as a list of
            sentences, each encoded as a list of ``(word,tag)`` tuples.

        :rtype: list(list(tuple(str,str)))
        c           
      J    g | ]\  }}t          ||d d ddgj        R di S r7   r   r9   s      r   r%   z4ChunkedCorpusReader.tagged_sents.<locals>.<listcomp>{   r:   r   Tr'   r;   s   ` `r   tagged_sentsz ChunkedCorpusReader.tagged_sentss   r=   r   c                 f     t           fd                     |d          D                       S )z
        :return: the given file(s) as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as lists of ``(word,tag)`` tuples.
        :rtype: list(list(list(tuple(str,str))))
        c           
      J    g | ]\  }}t          ||d d d dgj        R di S r7   r   r9   s      r   r%   z4ChunkedCorpusReader.tagged_paras.<locals>.<listcomp>   r:   r   Tr'   r;   s   ` `r   tagged_parasz ChunkedCorpusReader.tagged_paras   r=   r   c                 f     t           fd                     |d          D                       S )av  
        :return: the given file(s) as a list of tagged
            words and chunks.  Words are encoded as ``(word, tag)``
            tuples (if the corpus has tags) or word strings (if the
            corpus has no tags).  Chunks are encoded as depth-one
            trees over ``(word,tag)`` tuples or word strings.
        :rtype: list(tuple(str,str) and Tree)
        c           
      J    g | ]\  }}t          ||d ddd gj        R di S r7   r   r9   s      r   r%   z5ChunkedCorpusReader.chunked_words.<locals>.<listcomp>   r:   r   Tr'   r;   s   ` `r   chunked_wordsz!ChunkedCorpusReader.chunked_words   S          !%gt < <	  
 
 	
r   c                 f     t           fd                     |d          D                       S )a6  
        :return: the given file(s) as a list of
            sentences, each encoded as a shallow Tree.  The leaves
            of these trees are encoded as ``(word, tag)`` tuples (if
            the corpus has tags) or word strings (if the corpus has no
            tags).
        :rtype: list(Tree)
        c           
      J    g | ]\  }}t          ||d d dd gj        R di S r7   r   r9   s      r   r%   z5ChunkedCorpusReader.chunked_sents.<locals>.<listcomp>   r:   r   Tr'   r;   s   ` `r   chunked_sentsz!ChunkedCorpusReader.chunked_sents   rG   r   c                 f     t           fd                     |d          D                       S )ao  
        :return: the given file(s) as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as a shallow Tree.  The leaves of these
            trees are encoded as ``(word, tag)`` tuples (if the corpus
            has tags) or word strings (if the corpus has no tags).
        :rtype: list(list(Tree))
        c           
      J    g | ]\  }}t          ||d d d d gj        R di S )r/   r8   r   r9   s      r   r%   z5ChunkedCorpusReader.chunked_paras.<locals>.<listcomp>   r:   r   Tr'   r;   s   ` `r   chunked_parasz!ChunkedCorpusReader.chunked_paras   rG   r   c                 4    d t          |          D             S )Nc                 ,    g | ]}t          |          S  r   )r"   ts     r   r%   z3ChunkedCorpusReader._read_block.<locals>.<listcomp>   s    EEE1AEEEr   )read_blankline_block)r   streams     r   _read_blockzChunkedCorpusReader._read_block   s    EE(<V(D(DEEEEr   )NNN)__name__
__module____qualname____doc__r   RegexpTokenizerrR   r   r+   r0   r4   r<   r@   rC   rF   rJ   rM   rT   rP   r   r   r	   r	      s!       	 	 !&t$777.A A A A(
 
 
 

 
 
 

 
 
 

 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
$
 
 
 
$
 
 
 
$F F F F Fr   r	   c                   &    e Zd Z	 	 ddZd Zd ZdS )r    Nc                     t                               | ||           || _        || _        || _        || _        || _        || _        |	| _        |
| _	        || _
        d S )N)r   )StreamBackedCorpusViewr   _tagged_group_by_sent_group_by_para_chunked_str2chunktree_sent_tokenizer_para_block_reader_source_tagset_target_tagset)r   fileidr   taggedgroup_by_sentgroup_by_parachunkedr   r   r   source_tagsetr8   s               r   r   zChunkedCorpusView.__init__   sk     	''fx'HHH+++-"3++r   c                    g }|                      |          D ]}g }| j                            |          D ]}|                     || j        | j                  }| j        s|                     |          }| j        s|	                                }| j
        r|                    |           x|                    |           | j        r|                    |           |                    |           |S )N)rl   r8   )rd   rc   tokenizerb   re   rf   r^   _untagra   leavesr_   appendextendr`   )r   rS   blockpara_strparasent_strsents          r   
read_blockzChunkedCorpusView.read_block   s   //77 	# 	#HD 099(CC & &**"&"5"&"5 +   | -;;t,,D } );;==D & &KK%%%%KK%%%% " #T""""T"""" r   c                     t          |          D ]_\  }}t          |t                    r|                     |           0t          |t                    r|d         ||<   Qt          d          |S )Nr   z"expected child to be Tree or tuple)	enumerate
isinstancer   ro   tuple
ValueError)r   treeichilds       r   ro   zChunkedCorpusView._untag	  s{    !$ 	G 	GHAu%&& GE""""E5)) G(Q !EFFFr   rU   )rV   rW   rX   r   rx   ro   rP   r   r   r    r       sO         , , , ,2     D    r   r    )rY   codecsos.pathosnltk
nltk.chunkr   nltk.corpus.reader.api nltk.corpus.reader.bracket_parser   nltk.corpus.reader.utilnltk.tokenize	nltk.treer   r   r	   r]   r    rP   r   r   <module>r      s    
    " " " " " " $ $ $ $ E E E E E E % % % %          pF pF pF pF pF, pF pF pFfD D D D D. D D D D Dr   