
    '[f18                         d Z ddlZddlZddlmZ ddlmZmZ ddlm	Z	 d Z
 G d de	          Z G d	 d
          Z G d d          Z G d de          ZdS )z9
A reader for corpora whose documents are in MTE format.
    N)reduce)TaggedCorpusReaderconcat)XMLCorpusViewc                 .    |                      ||          S N)findall)rootpathnss      J/var/www/piapp/venv/lib/python3.11/site-packages/nltk/corpus/reader/mte.pyxpathr      s    <<b!!!    c                   "    e Zd ZdZddZddZdS )MTECorpusViewz0
    Class for lazy viewing the MTE Corpus.
    Nc                 4    t          j        | |||           d S r   )r   __init__)selffileidtagspecelt_handlers       r   r   zMTECorpusView.__init__   s    tVWkBBBBBr   c                 h    t          t          d t          j        | |||                              S )Nc                 
    | d uS r    xs    r   <lambda>z*MTECorpusView.read_block.<locals>.<lambda>   
    !4- r   )listfilterr   
read_block)r   streamr   r   s       r   r!   zMTECorpusView.read_block   s;    ''(vwLL 
 
 	
r   r   )NN)__name__
__module____qualname____doc__r   r!   r   r   r   r   r      sJ         C C C C
 
 
 
 
 
r   r   c                   2   e Zd ZdZdddZdZdZdZdZd	Z	d
 Z
ed             Zed             Zed             Zed             Zed             Zed             Zed             Zed             Zed             Zd Zd Zd Zd Zd Zd Zd Zd Zd ZdS )MTEFileReaderz
    Class for loading the content of the multext-east corpus. It
    parses the xml files and does some tag-filtering depending on the
    given method parameters.
    zhttps://www.tei-c.org/ns/1.0z%https://www.w3.org/XML/1998/namespace)teixmlz{https://www.tei-c.org/ns/1.0}z'{https://www.w3.org/XML/1998/namespace}zTEI/text/body/div/div/p/s/(w|c)zTEI/text/body/div/div/p/szTEI/text/body/div/div/pc                     || _         d S r   )_MTEFileReader__file_path)r   	file_paths     r   r   zMTEFileReader.__init__2   s    $r   c                     |j         S r   )textclseltcontexts      r   	_word_eltzMTEFileReader._word_elt5   s	    xr   c                 H      fdt          |d j                  D             S )Nc                 <    g | ]}                     |d           S r   )r4   .0wr1   s     r   
<listcomp>z+MTEFileReader._sent_elt.<locals>.<listcomp>;   '    HHH1a&&HHHr   *r   r   r0   s   `  r   	_sent_eltzMTEFileReader._sent_elt9   +    HHHHc30G0GHHHHr   c                 H      fdt          |d j                  D             S )Nc                 <    g | ]}                     |d           S r   )r>   r8   sr1   s     r   r:   z+MTEFileReader._para_elt.<locals>.<listcomp>?   r;   r   r<   r=   r0   s   `  r   	_para_eltzMTEFileReader._para_elt=   r?   r   c                 >   d|j         vr	|j        dfS | j        dk    r| j        dk    r|j        |j         d         fS | j        dk    r7| j        dk    r,|j        t                              |j         d                   fS t          j        dt          j        dd| j                  z   dz             }|	                    |j         d                   rK| j        dk    r|j        |j         d         fS |j        t                              |j         d                   fS d S )	Nana msd	universal^-.z.*$)
attribr/   _MTEFileReader__tags_MTEFileReader__tagsetMTETagConvertermsd_to_universalrecompilesubmatch)r1   r2   r3   tagss       r   _tagged_word_eltzMTEFileReader._tagged_word_eltA   s   
""Hb>!: 5 5Hcj/00Z2#,+"="=Ho>>sz%?PQQRR:cBF3SZ$@$@@5HIIDzz#*U+,, 	<5((Hcj&788 '88E9JKK 
 tr   c                      t          t          d  fdt          |d j                  D                                 S )Nc                 
    | d uS r   r   r   s    r   r   z0MTEFileReader._tagged_sent_elt.<locals>.<lambda>[   r   r   c                 <    g | ]}                     |d           S r   )rW   r7   s     r   r:   z2MTEFileReader._tagged_sent_elt.<locals>.<listcomp>\   )    PPP1%%a..PPPr   r<   r   r    r   r   r0   s   `  r   _tagged_sent_eltzMTEFileReader._tagged_sent_eltW   M    ''PPPPc38O8OPPP 
 
 	
r   c                      t          t          d  fdt          |d j                  D                                 S )Nc                 
    | d uS r   r   r   s    r   r   z0MTEFileReader._tagged_para_elt.<locals>.<lambda>d   r   r   c                 <    g | ]}                     |d           S r   )r]   rB   s     r   r:   z2MTEFileReader._tagged_para_elt.<locals>.<listcomp>e   r[   r   r<   r\   r0   s   `  r   _tagged_para_eltzMTEFileReader._tagged_para_elt`   r^   r   c                 N    d|j         vr	|j        dfS |j        |j         d         fS )NlemmarG   )rM   r/   r0   s      r   _lemma_word_eltzMTEFileReader._lemma_word_elti   s/    #*$$Hb>!Hcj122r   c                 H      fdt          |d j                  D             S )Nc                 <    g | ]}                     |d           S r   )re   r7   s     r   r:   z1MTEFileReader._lemma_sent_elt.<locals>.<listcomp>r   )    NNN##At,,NNNr   r<   r=   r0   s   `  r   _lemma_sent_eltzMTEFileReader._lemma_sent_eltp   +    NNNNeCcf6M6MNNNNr   c                 H      fdt          |d j                  D             S )Nc                 <    g | ]}                     |d           S r   )ri   rB   s     r   r:   z1MTEFileReader._lemma_para_elt.<locals>.<listcomp>v   rh   r   r<   r=   r0   s   `  r   _lemma_para_eltzMTEFileReader._lemma_para_eltt   rj   r   c                 V    t          | j        t          j        t          j                  S r   )r   r,   r(   	word_pathr4   r   s    r   wordszMTEFileReader.wordsx   $    m5}7N
 
 	
r   c                 V    t          | j        t          j        t          j                  S r   )r   r,   r(   	sent_pathr>   rp   s    r   sentszMTEFileReader.sents}   rr   r   c                 V    t          | j        t          j        t          j                  S r   )r   r,   r(   	para_pathrD   rp   s    r   paraszMTEFileReader.paras   rr   r   c                 V    t          | j        t          j        t          j                  S r   )r   r,   r(   ro   re   rp   s    r   lemma_wordszMTEFileReader.lemma_words   $    m5}7T
 
 	
r   c                     |t           _        |t           _        t          | j        t           j        t           j                  S r   )r(   rO   rN   r   r,   ro   rW   r   tagsetrV   s      r   tagged_wordszMTEFileReader.tagged_words   4    !'#m5}7U
 
 	
r   c                 V    t          | j        t          j        t          j                  S r   )r   r,   r(   rt   ri   rp   s    r   lemma_sentszMTEFileReader.lemma_sents   r{   r   c                     |t           _        |t           _        t          | j        t           j        t           j                  S r   )r(   rO   rN   r   r,   rt   r]   r}   s      r   tagged_sentszMTEFileReader.tagged_sents   r   r   c                 V    t          | j        t          j        t          j                  S r   )r   r,   r(   rw   rm   rp   s    r   lemma_paraszMTEFileReader.lemma_paras   r{   r   c                     |t           _        |t           _        t          | j        t           j        t           j                  S r   )r(   rO   rN   r   r,   rw   rb   r}   s      r   tagged_paraszMTEFileReader.tagged_paras   r   r   N)r#   r$   r%   r&   r   tag_nsxml_nsro   rt   rw   r   classmethodr4   r>   rD   rW   r]   rb   re   ri   rm   rq   ru   rx   rz   r   r   r   r   r   r   r   r   r(   r(   !   s         .6
 
B .F6F1I+I)I% % %   [ I I [I I I [I   [* 
 
 [
 
 
 [
 3 3 [3 O O [O O O [O
 
 


 
 


 
 


 
 


 
 

 
 


 
 

 
 


 
 
 
 
r   r(   c                   F    e Zd ZdZdddddddd	d
ddddZed             ZdS )rP   zu
    Class for converting msd tags to universal tags, more conversion
    options are currently not implemented.
    ADJADPADVCONJDETNOUNNUMPRTPRONVERBrL   X)ASRCDNMQPVrL   rK   c                 ~    | d         dk    s| d         n| d         }|t           j        vrd}t           j        |         S )z
        This function converts the annotation from the Multex-East to the universal tagset
        as described in Chapter 5 of the NLTK-Book

        Unknown Tags will be mapped to X. Punctuation marks are not supported in MSD tags, so
        r   #   rK   )rP   mapping_msd_universal)tag	indicators     r   rQ   z MTETagConverter.msd_to_universal   sB     #&a&C--CFFSV	OAAAI4Y??r   N)r#   r$   r%   r&   r   staticmethodrQ   r   r   r   rP   rP      sq            @ @ \@ @ @r   rP   c                   h    e Zd ZdZddZd ZddZddZddZdd	Z	ddZ
ddZddZddZddZdS )MTECorpusReaderz
    Reader for corpora following the TEI-p5 xml scheme, such as MULTEXT-East.
    MULTEXT-East contains part-of-speech-tagged words with a quite precise tagging
    scheme. These tags can be converted to the Universal tagset
    Nutf8c                 B    t          j        | |||           d| _        dS )a.  
        Construct a new MTECorpusreader for a set of documents
        located at the given root directory.  Example usage:

            >>> root = '/...path to corpus.../'
            >>> reader = MTECorpusReader(root, 'oana-*.xml', 'utf8') # doctest: +SKIP

        :param root: The root directory for this corpus. (default points to location in multext config file)
        :param fileids: A list or regexp specifying the fileids in this corpus. (default is oana-en.xml)
        :param encoding: The encoding of the given files (default is utf8)
        z00README.txtN)r   r   _readme)r   r
   fileidsencodings       r   r   zMTECorpusReader.__init__   s&     	#D$BBB%r   c                      | j         }nt          |t                    r|g}t           fd|          }t          d |          }|st	          d           |S )Nc                     | j         v S r   )_fileids)r   r   s    r   r   z+MTECorpusReader.__fileids.<locals>.<lambda>   s    1#5 r   c                 
    | dvS )N)zoana-bg.xmlzoana-mk.xmlr   r   s    r   r   z+MTECorpusReader.__fileids.<locals>.<lambda>   s    1,J#J r   z$No valid multext-east file specified)r   
isinstancestrr    printr   r   s   ` r   	__fileidszMTECorpusReader.__fileids   sr    ?mGG%% 	 iG5555w??JJGTT 	:8999r   c                 `     t           fd                     |          D                       S )z
        :param fileids: A list specifying the fileids that should be used.
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        c                     g | ]F}t          t          j                            j        |                                                    GS r   )r(   osr   join_rootrq   r8   fr   s     r   r:   z)MTECorpusReader.words.<locals>.<listcomp>   O        bgll4:q99::@@BB  r   r   _MTECorpusReader__fileidsr   s   ` r   rq   zMTECorpusReader.words   sI        00  
 
 	
r   c                 `     t           fd                     |          D                       S )z
        :param fileids: A list specifying the fileids that should be used.
        :return: the given file(s) as a list of sentences or utterances,
                 each encoded as a list of word strings
        :rtype: list(list(str))
        c                     g | ]F}t          t          j                            j        |                                                    GS r   )r(   r   r   r   r   ru   r   s     r   r:   z)MTECorpusReader.sents.<locals>.<listcomp>	  r   r   r   r   s   ` r   ru   zMTECorpusReader.sents  I        00  
 
 	
r   c                 `     t           fd                     |          D                       S )a  
        :param fileids: A list specifying the fileids that should be used.
        :return: the given file(s) as a list of paragraphs, each encoded as a list
                 of sentences, which are in turn encoded as lists of word string
        :rtype: list(list(list(str)))
        c                     g | ]F}t          t          j                            j        |                                                    GS r   )r(   r   r   r   r   rx   r   s     r   r:   z)MTECorpusReader.paras.<locals>.<listcomp>  r   r   r   r   s   ` r   rx   zMTECorpusReader.paras  r   r   c                 `     t           fd                     |          D                       S )a  
        :param fileids: A list specifying the fileids that should be used.
        :return: the given file(s) as a list of words, the corresponding lemmas
                 and punctuation symbols, encoded as tuples (word, lemma)
        :rtype: list(tuple(str,str))
        c                     g | ]F}t          t          j                            j        |                                                    GS r   )r(   r   r   r   r   rz   r   s     r   r:   z/MTECorpusReader.lemma_words.<locals>.<listcomp>%  O        bgll4:q99::FFHH  r   r   r   s   ` r   rz   zMTECorpusReader.lemma_words  r   r   rH   rG   c                      dk    sdk    r0t           fd                     |          D                       S t          d           dS )a;  
        :param fileids: A list specifying the fileids that should be used.
        :param tagset: The tagset that should be used in the returned object,
                       either "universal" or "msd", "msd" is the default
        :param tags: An MSD Tag that is used to filter all parts of the used corpus
                     that are not more precise or at least equal to the given tag
        :return: the given file(s) as a list of tagged words and punctuation symbols
                 encoded as tuples (word, tag)
        :rtype: list(tuple(str, str))
        rI   rH   c                     g | ]H}t          t          j                            j        |                                                  IS r   )r(   r   r   r   r   r   r8   r   r   rV   r~   s     r   r:   z0MTECorpusReader.tagged_words.<locals>.<listcomp>8  Z         ""',,tz1"="=>>KK   r   Unknown tagset specified.Nr   r   r   r   r   r~   rV   s   ` ``r   r   zMTECorpusReader.tagged_words+       [  FeOO      "^^G44	     -.....r   c                 `     t           fd                     |          D                       S )aB  
        :param fileids: A list specifying the fileids that should be used.
        :return: the given file(s) as a list of sentences or utterances, each
                 encoded as a list of tuples of the word and the corresponding
                 lemma (word, lemma)
        :rtype: list(list(tuple(str, str)))
        c                     g | ]F}t          t          j                            j        |                                                    GS r   )r(   r   r   r   r   r   r   s     r   r:   z/MTECorpusReader.lemma_sents.<locals>.<listcomp>K  r   r   r   r   s   ` r   r   zMTECorpusReader.lemma_sentsB  I        00  
 
 	
r   c                      dk    sdk    r0t           fd                     |          D                       S t          d           dS )aH  
        :param fileids: A list specifying the fileids that should be used.
        :param tagset: The tagset that should be used in the returned object,
                       either "universal" or "msd", "msd" is the default
        :param tags: An MSD Tag that is used to filter all parts of the used corpus
                     that are not more precise or at least equal to the given tag
        :return: the given file(s) as a list of sentences or utterances, each
                 each encoded as a list of (word,tag) tuples
        :rtype: list(list(tuple(str, str)))
        rI   rH   c                     g | ]H}t          t          j                            j        |                                                  IS r   )r(   r   r   r   r   r   r   s     r   r:   z0MTECorpusReader.tagged_sents.<locals>.<listcomp>^  r   r   r   Nr   r   s   ` ``r   r   zMTECorpusReader.tagged_sentsQ  r   r   c                 `     t           fd                     |          D                       S )am  
        :param fileids: A list specifying the fileids that should be used.
        :return: the given file(s) as a list of paragraphs, each encoded as a
                 list of sentences, which are in turn encoded as a list of
                 tuples of the word and the corresponding lemma (word, lemma)
        :rtype: list(List(List(tuple(str, str))))
        c                     g | ]F}t          t          j                            j        |                                                    GS r   )r(   r   r   r   r   r   r   s     r   r:   z/MTECorpusReader.lemma_paras.<locals>.<listcomp>q  r   r   r   r   s   ` r   r   zMTECorpusReader.lemma_parash  r   r   c                      dk    sdk    r0t           fd                     |          D                       S t          d           dS )a  
        :param fileids: A list specifying the fileids that should be used.
        :param tagset: The tagset that should be used in the returned object,
                       either "universal" or "msd", "msd" is the default
        :param tags: An MSD Tag that is used to filter all parts of the used corpus
                     that are not more precise or at least equal to the given tag
        :return: the given file(s) as a list of paragraphs, each encoded as a
                 list of sentences, which are in turn encoded as a list
                 of (word,tag) tuples
        :rtype: list(list(list(tuple(str, str))))
        rI   rH   c                     g | ]H}t          t          j                            j        |                                                  IS r   )r(   r   r   r   r   r   r   s     r   r:   z0MTECorpusReader.tagged_paras.<locals>.<listcomp>  r   r   r   Nr   r   s   ` ``r   r   zMTECorpusReader.tagged_parasw  s     [  FeOO      "^^G44	     -.....r   )NNr   r   )NrH   rG   )r#   r$   r%   r&   r   r   rq   ru   rx   rz   r   r   r   r   r   r   r   r   r   r      s         & & & &  
 
 
 

 
 
 

 
 
 

 
 
 
/ / / /.
 
 
 
/ / / /.
 
 
 
/ / / / / /r   r   )r&   r   rR   	functoolsr   nltk.corpus.readerr   r   nltk.corpus.reader.xmldocsr   r   r   r(   rP   r   r   r   r   <module>r      s9    
			 				       9 9 9 9 9 9 9 9 4 4 4 4 4 4" " "
 
 
 
 
M 
 
 
"H
 H
 H
 H
 H
 H
 H
 H
V"@ "@ "@ "@ "@ "@ "@ "@J|/ |/ |/ |/ |/( |/ |/ |/ |/ |/r   