
    '[f                     ~    d dl mZ d dlmZmZmZ d dlmZmZ d dl	m
Z
mZ  G d de          Z G d de          Zd	S )
    )CorpusReader)StreamBackedCorpusViewconcatread_alignedsent_block)RegexpTokenizerWhitespaceTokenizer)AlignedSent	Alignmentc                   b    e Zd ZdZd e             edd          edfdZdd	Zdd
Z	ddZ
dS )AlignedCorpusReaderz
    Reader for corpora of word-aligned sentences.  Tokens are assumed
    to be separated by whitespace.  Sentences begin on separate lines.
    /
T)gapslatin1c                 l    t          j        | |||           || _        || _        || _        || _        dS )a  
        Construct a new Aligned Corpus reader for a set of documents
        located at the given root directory.  Example usage:

            >>> root = '/...path to corpus.../'
            >>> reader = AlignedCorpusReader(root, '.*', '.txt') # doctest: +SKIP

        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        N)r   __init___sep_word_tokenizer_sent_tokenizer_alignedsent_block_reader)selfrootfileidssepword_tokenizersent_tokenizeralignedsent_block_readerencodings           N/var/www/piapp/venv/lib/python3.11/site-packages/nltk/corpus/reader/aligned.pyr   zAlignedCorpusReader.__init__   s@    ( 	dD'8<<<	--)A&&&    Nc                 b     t           fd                     |d          D                       S )z~
        :return: the given file(s) as a list of words
            and punctuation symbols.
        :rtype: list(str)
        c                 ^    g | ])\  }}t          ||d d j        j        j                  *S )FAlignedSentCorpusViewr   r   r   .0fileidencr   s      r   
<listcomp>z-AlignedCorpusReader.words.<locals>.<listcomp>9   sW        "VS &((2   r    Tr   abspathsr   r   s   ` r   wordszAlignedCorpusReader.words2   sM         &*]]7D%A%A  
 
 	
r    c                 b     t           fd                     |d          D                       S )z
        :return: the given file(s) as a list of
            sentences or utterances, each encoded as a list of word
            strings.
        :rtype: list(list(str))
        c                 ^    g | ])\  }}t          ||d dj        j        j                  *S )FTr#   r%   s      r   r)   z-AlignedCorpusReader.sents.<locals>.<listcomp>O   sW        "VS &((2   r    Tr*   r,   s   ` r   sentszAlignedCorpusReader.sentsG   sM         &*]]7D%A%A  
 
 	
r    c                 b     t           fd                     |d          D                       S )zp
        :return: the given file(s) as a list of AlignedSent objects.
        :rtype: list(AlignedSent)
        c                 ^    g | ])\  }}t          ||d d j        j        j                  *S )Tr#   r%   s      r   r)   z5AlignedCorpusReader.aligned_sents.<locals>.<listcomp>c   sW        "VS &((2   r    Tr*   r,   s   ` r   aligned_sentsz!AlignedCorpusReader.aligned_sents]   sM    
     &*]]7D%A%A  
 
 	
r    )N)__name__
__module____qualname____doc__r   r   r   r   r-   r0   r3    r    r   r   r      s          **,,&t$777!7B B B B4
 
 
 
*
 
 
 
,
 
 
 
 
 
r    r   c                       e Zd ZdZd Zd ZdS )r$   z
    A specialized corpus view for aligned sentences.
    ``AlignedSentCorpusView`` objects are typically created by
    ``AlignedCorpusReader`` (not directly by nltk users).
    c                 z    || _         || _        || _        || _        || _        t          j        | ||           d S )N)r   )_aligned_group_by_sentr   r   r   r   r   )r   corpus_filer   alignedgroup_by_sentr   r   r   s           r   r   zAlignedSentCorpusView.__init__y   sI      +--)A&'kHMMMMMMr    c                       fd                      |          D             } j        r;t          j        d                    |d                             |d<   t          | g}n j        r
|d         g}n|d         }|S )Nc                 ~    g | ]9}j                             |          D ]}j                            |          :S r8   )r   tokenizer   )r&   alignedsent_strsent_strr   s      r   r)   z4AlignedSentCorpusView.read_block.<locals>.<listcomp>   sb     
 
 
 099/JJ
 
   ))(33
 
 
 
r        r   )r   r;   r
   
fromstringjoinr	   r<   )r   streamblocks   `  r   
read_blockz AlignedSentCorpusView.read_block   s    
 
 
 
#'#A#A&#I#I
 
 

 = 	 +q"" E!H !%()EE  	1XJEE!HEr    N)r4   r5   r6   r7   r   rK   r8   r    r   r$   r$   r   s?         N N N"    r    r$   N)nltk.corpus.reader.apir   nltk.corpus.reader.utilr   r   r   nltk.tokenizer   r   nltk.translater	   r
   r   r$   r8   r    r   <module>rP      s    0 / / / / /         
 ? > > > > > > > 1 1 1 1 1 1 1 1]
 ]
 ]
 ]
 ]
, ]
 ]
 ]
@( ( ( ( (2 ( ( ( ( (r    