
    '[f%                         d Z ddlmZ ddlmZmZmZ  G d de          ZddZ G d d	e	          Z
 G d
 de          ZdS )zACorpus reader for the XML version of the British National Corpus.    )concat)ElementTreeXMLCorpusReaderXMLCorpusViewc                   H    e Zd ZdZddZddZddZddZdd	Zdd
Z	d Z
dS )BNCCorpusReadera7  Corpus reader for the XML version of the British National Corpus.

    For access to the complete XML data structure, use the ``xml()``
    method.  For access to simple word lists and tagged word lists, use
    ``words()``, ``sents()``, ``tagged_words()``, and ``tagged_sents()``.

    You can obtain the full version of the BNC corpus at
    https://www.ota.ox.ac.uk/desc/2554

    If you extracted the archive to a directory called `BNC`, then you can
    instantiate the reader as::

        BNCCorpusReader(root='BNC/Texts/', fileids=r'[A-K]/\w*/\w*\.xml')

    Tc                 @    t          j        | ||           || _        d S N)r   __init___lazy)selfrootfileidslazys       J/var/www/piapp/venv/lib/python3.11/site-packages/nltk/corpus/reader/bnc.pyr   zBNCCorpusReader.__init__   s"     tW555


    NFc                 4    |                      |dd||          S )aT  
        :return: the given file(s) as a list of words
            and punctuation symbols.
        :rtype: list(str)

        :param strip_space: If true, then strip trailing spaces from
            word tokens.  Otherwise, leave the spaces on the tokens.
        :param stem: If true, then use word stems instead of word strings.
        FN_viewsr   r   strip_spacestems       r   wordszBNCCorpusReader.words#   s     {{7E4dCCCr   c                 @    |rdnd}|                      |d|||          S )a   
        :return: the given file(s) as a list of tagged
            words and punctuation symbols, encoded as tuples
            ``(word,tag)``.
        :rtype: list(tuple(str,str))

        :param c5: If true, then the tags used will be the more detailed
            c5 tags.  Otherwise, the simplified tags will be used.
        :param strip_space: If true, then strip trailing spaces from
            word tokens.  Otherwise, leave the spaces on the tokens.
        :param stem: If true, then use word stems instead of word strings.
        c5posFr   r   r   r   r   r   tags         r   tagged_wordszBNCCorpusReader.tagged_words/   s,     #dde{{7E3TBBBr   c                 4    |                      |dd||          S )a  
        :return: the given file(s) as a list of
            sentences or utterances, each encoded as a list of word
            strings.
        :rtype: list(list(str))

        :param strip_space: If true, then strip trailing spaces from
            word tokens.  Otherwise, leave the spaces on the tokens.
        :param stem: If true, then use word stems instead of word strings.
        TNr   r   s       r   sentszBNCCorpusReader.sents?   s     {{7D$TBBBr   c                 B    |rdnd}|                      |d|||          S )a  
        :return: the given file(s) as a list of
            sentences, each encoded as a list of ``(word,tag)`` tuples.
        :rtype: list(list(tuple(str,str)))

        :param c5: If true, then the tags used will be the more detailed
            c5 tags.  Otherwise, the simplified tags will be used.
        :param strip_space: If true, then strip trailing spaces from
            word tokens.  Otherwise, leave the spaces on the tokens.
        :param stem: If true, then use word stems instead of word strings.
        r   r   T)sentr   r   r   r   r   s         r   tagged_sentszBNCCorpusReader.tagged_sentsL   s8     #dde{{$C[t  
 
 	
r   c                     | j         rt          n| j        t          fd|                     |          D                       S )zPA helper function that instantiates BNCWordViews or the list of words/sentences.c           	      .    g | ]} |          S  r'   ).0fileidfr#   r   r   r   s     r   
<listcomp>z*BNCCorpusReader._views.<locals>.<listcomp>a   s;        &$[$77  r   )r   BNCWordView_wordsr   abspaths)r   r   r#   r   r   r   r*   s     ````@r   r   zBNCCorpusReader._views]   sq    :6KK4;       "mmG44  
 
 	
r   c           	         g }t          j        |                                          }|                    d          D ]}g }	t	          |          D ]}
|
j        }|sd}|s|r|                                }|r|
                    d|          }|dk    r||
                    d          f}n1|dk    r+||
                    d|
                    d                    f}|	                    |           |r/|                    t          |j
        d         |	                     |                    |	           d|vsJ |S )a  
        Helper used to implement the view methods -- returns a list of
        words or a list of sentences, optionally tagged.

        :param fileid: The name of the underlying file.
        :param bracket_sent: If true, include sentence bracketing.
        :param tag: The name of the tagset to use, or None for no tags.
        :param strip_space: If true, strip spaces from word tokens.
        :param stem: If true, then substitute stems for words.
        z.//s hwr   r   nN)r   parsegetrootfindall_all_xmlwords_intextstripgetappendBNCSentenceattribextend)r   r)   bracket_sentr   r   r   resultxmldocxmlsentr#   xmlwordwords               r   r-   zBNCCorpusReader._wordsg   sc    "6**2244~~f-- 	$ 	$GD+G44 " "| D ($ (::<<D 3";;tT22D$;; '++d"3"34DDE\\ '++eW[[5F5F"G"GHDD!!!! $k'.*=tDDEEEEd####6!!!!r   )T)NTF)NFTF)NFFTF)__name__
__module____qualname____doc__r   r   r   r!   r$   r   r-   r'   r   r   r   r      s             
D 
D 
D 
DC C C C C C C C
 
 
 
"
 
 
 
# # # # #r   r   Nc                 v    |g }| D ]1}|j         dv r|                    |           !t          ||           2|S )N)cw)r   r:   r6   )eltr?   childs      r   r6   r6      sU    ~ , ,9
""MM%    UF++++Mr   c                       e Zd ZdZd ZdS )r;   z
    A list of words, augmented by an attribute ``num`` used to record
    the sentence identifier (the ``n`` attribute from the XML).
    c                 J    || _         t                              | |           d S r
   )numlistr   )r   rO   itemss      r   r   zBNCSentence.__init__   s#    dE"""""r   N)rD   rE   rF   rG   r   r'   r   r   r;   r;      s-         
# # # # #r   r;   c                   :    e Zd ZdZh dZ	 d Zd Zd Zd Zd Z	dS )	r,   zN
    A stream backed corpus view specialized for use with the BNC corpus.
    >   pbgapaligneventpauseshiftvocalunclearc                 T   |rd}nd}|| _         || _        || _        || _        d| _        d| _        d| _        d| _        t          j	        | ||           | 
                                 |                     | j        d| j                   |                                  ddi| _        dS )aG  
        :param fileid: The name of the underlying file.
        :param sent: If true, include sentence bracketing.
        :param tag: The name of the tagset to use, or None for no tags.
        :param strip_space: If true, strip spaces from word tokens.
        :param stem: If true, then substitute stems for words.
        z.*/sz.*/s/(.*/)?(c|w)Nz.*/teiHeader$r   r'   )_sent_tag_strip_space_stemtitleauthoreditorrespsr   r   _open
read_block_streamhandle_headerclose_tag_context)r   r)   r#   r   r   r   tagspecs          r   r   zBNCWordView.__init__   s      	)GG(G
	'


tVW555 	

ot7IJJJ

 Gr   c                    |                     d          }|r$d                    d |D                       | _        |                     d          }|r$d                    d |D                       | _        |                     d          }|r$d                    d |D                       | _        |                     d          }|r&d	                    d
 |D                       | _        d S d S )NztitleStmt/title
c              3   H   K   | ]}|j                                         V  d S r
   r7   r8   )r(   r`   s     r   	<genexpr>z,BNCWordView.handle_header.<locals>.<genexpr>   s0      "J"J%5:#3#3#5#5"J"J"J"J"J"Jr   ztitleStmt/authorc              3   H   K   | ]}|j                                         V  d S r
   rn   )r(   ra   s     r   ro   z,BNCWordView.handle_header.<locals>.<genexpr>   0      #N#NFFK$5$5$7$7#N#N#N#N#N#Nr   ztitleStmt/editorc              3   H   K   | ]}|j                                         V  d S r
   rn   )r(   rb   s     r   ro   z,BNCWordView.handle_header.<locals>.<genexpr>   rq   r   ztitleStmt/respStmtz

c              3   T   K   | ]#}d                      d |D                       V  $dS )rl   c              3   H   K   | ]}|j                                         V  d S r
   rn   )r(   resp_elts     r   ro   z6BNCWordView.handle_header.<locals>.<genexpr>.<genexpr>   s0      EEH(---//EEEEEEr   N)join)r(   resps     r   ro   z,BNCWordView.handle_header.<locals>.<genexpr>   sN       % %JN		EEEEEEE% % % % % %r   )r5   rv   r`   ra   rb   rc   )r   rK   contexttitlesauthorseditorsrc   s          r   rg   zBNCWordView.handle_header   s   .// 	K"J"J6"J"J"JJJDJ++011 	O))#N#Ng#N#N#NNNDK++011 	O))#N#Ng#N#N#NNNDK011 	 % %RW% % %  DJJJ	 	r   c                 d    | j         r|                     |          S |                     |          S r
   )r\   handle_senthandle_word)r   rK   rx   s      r   
handle_eltzBNCWordView.handle_elt   s4    : 	)##C(((##C(((r   c                 L   |j         }|sd}| j        s| j        r|                                }| j        r|                    d|          }| j        dk    r||                    d          f}n6| j        dk    r+||                    d|                    d                    f}|S )Nr0   r1   r   r   )r7   r^   r_   r8   r9   r]   )r   rK   rC   s      r   r~   zBNCWordView.handle_word   s    x 	D 	 
 	 ::<<D: 	'774&&D9#''$--(DDY%#''%778Dr   c                 ,    g }|D ]t}|j         dv r| fd|D             z  }|j         dv r)|                                         |                     O|j          j        vrt	          d|j         z            ut          |j        d         |          S )N)mwhicorrtruncc                 :    g | ]}                     |          S r'   )r~   )r(   rJ   r   s     r   r+   z+BNCWordView.handle_sent.<locals>.<listcomp>  s'    <<<))!,,<<<r   )rJ   rI   zUnexpected element %sr2   )r   r:   r~   tags_to_ignore
ValueErrorr;   r<   )r   rK   r#   rL   s   `   r   r}   zBNCWordView.handle_sent   s     	F 	FEy999<<<<e<<<<j((D,,U334444$"555 !859!DEEE 63:c?D111r   N)
rD   rE   rF   rG   r   r   rg   r   r~   r}   r'   r   r   r,   r,      s         	 	 	N$ $ $@  () ) )  	2 	2 	2 	2 	2r   r,   r
   )rG   nltk.corpus.reader.utilr   nltk.corpus.reader.xmldocsr   r   r   r   r6   rP   r;   r,   r'   r   r   <module>r      s    H G * * * * * * R R R R R R R R R R| | | | |o | | |~   # # # # #$ # # #f2 f2 f2 f2 f2- f2 f2 f2 f2 f2r   