
    '[f?                         d Z ddlZddlmZ ddlmZ ddlT ddlmZ ddl	m
Z
 ddlmZ  G d	 d
e          Z G d de          ZdS )z
Corpus reader for corpora whose documents are xml files.

(note -- not named 'xml' to avoid conflicting w/ standard xml package)
    N)ElementTree)CorpusReader)*)SeekableUnicodeStreamReader)ElementWrapper)WordPunctTokenizerc                   *    e Zd ZdZddZddZddZdS )	XMLCorpusReadera  
    Corpus reader for corpora whose documents are xml files.

    Note that the ``XMLCorpusReader`` constructor does not take an
    ``encoding`` argument, because the unicode encoding is specified by
    the XML files themselves.  See the XML specs for more info.
    Fc                 @    || _         t          j        | ||           d S N)_wrap_etreer   __init__)selfrootfileids
wrap_etrees       N/var/www/piapp/venv/lib/python3.11/site-packages/nltk/corpus/reader/xmldocs.pyr   zXMLCorpusReader.__init__!   s%    %dD'22222    Nc                    |%t          | j                  dk    r| j        d         }t          |t                    st	          d          |                     |                                          5 }t          j        |          	                                }d d d            n# 1 swxY w Y   | j
        rt          |          }|S )N   r   z(Expected a single file identifier string)len_fileids
isinstancestr	TypeErrorabspathopenr   parsegetrootr   r   )r   fileidfpelts       r   xmlzXMLCorpusReader.xml%   s    >c$-00A55]1%F&#&& 	HFGGG\\&!!&&(( 	2B#B''//11C	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2  	& %%C
s   3'B&&B*-B*c                    |                      |          }|                     |          }t                      }	 |                                }n#  |                                }Y nxY wg }|D ]_}|j        }|Tt          |t                    r|                    |          }|	                    |          }	|
                    |	           `|S )aE  
        Returns all of the words and punctuation symbols in the specified file
        that were in text nodes -- ie, tags are ignored. Like the xml() method,
        fileid can only specify one file.

        :return: the given file's text nodes as a list of words and punctuation symbols
        :rtype: list(str)
        )r#   encodingr   getiteratoritertextr   bytesdecodetokenizeextend)
r   r    r"   r%   word_tokenizeriteratoroutnoder(   tokss
             r   wordszXMLCorpusReader.words4   s     hhv==((+--	"((HH	"xxzzHHH 	! 	!D9DdE** 1;;x00D%..t44

4   
s   A A')Fr   )__name__
__module____qualname____doc__r   r#   r2    r   r   r
   r
      sZ         3 3 3 3        r   r
   c                       e Zd ZdZdZdZddZd Zd Ze	
                    de	j        e	j        z            Ze	
                    d	          Ze	
                    d
e	j        e	j        z            Zd ZddZdS )XMLCorpusViewam  
    A corpus view that selects out specified elements from an XML
    file, and provides a flat list-like interface for accessing them.
    (Note: ``XMLCorpusView`` is not used by ``XMLCorpusReader`` itself,
    but may be used by subclasses of ``XMLCorpusReader``.)

    Every XML corpus view has a "tag specification", indicating what
    XML elements should be included in the view; and each (non-nested)
    element that matches this specification corresponds to one item in
    the view.  Tag specifications are regular expressions over tag
    paths, where a tag path is a list of element tag names, separated
    by '/', indicating the ancestry of the element.  Some examples:

      - ``'foo'``: A top-level element whose tag is ``foo``.
      - ``'foo/bar'``: An element whose tag is ``bar`` and whose parent
        is a top-level element whose tag is ``foo``.
      - ``'.*/foo'``: An element whose tag is ``foo``, appearing anywhere
        in the xml tree.
      - ``'.*/(foo|bar)'``: An wlement whose tag is ``foo`` or ``bar``,
        appearing anywhere in the xml tree.

    The view items are generated from the selected XML elements via
    the method ``handle_elt()``.  By default, this method returns the
    element as-is (i.e., as an ElementTree object); but it can be
    overridden, either via subclassing or via the ``elt_handler``
    constructor parameter.
    Fi   Nc                     |r|| _         t                              |dz             | _        	 ddi| _        	 |                     |          }t                              | ||           dS )aW  
        Create a new corpus view based on a specified XML file.

        Note that the ``XMLCorpusView`` constructor does not take an
        ``encoding`` argument, because the unicode encoding is
        specified by the XML files themselves.

        :type tagspec: str
        :param tagspec: A tag specification, indicating what XML
            elements should be included in the view.  Each non-nested
            element that matches this specification corresponds to one
            item in the view.

        :param elt_handler: A function used to transform each element
            to a value for the view.  If no handler is specified, then
            ``self.handle_elt()`` is called, which returns the element
            as an ElementTree object.  The signature of elt_handler is::

                elt_handler(elt, tagspec) -> value
        z\Zr   r7   )r%   N)
handle_eltrecompile_tagspec_tag_context_detect_encodingStreamBackedCorpusViewr   )r   r    tagspecelt_handlerr%   s        r   r   zXMLCorpusView.__init__u   st    *  	*)DO

7U?339G	
 ((00''fx'HHHHHr   c                    t          |t                    rW	 |                                }|                                }|                                 nU# |                                 w xY wt          |d          5 }|                                }d d d            n# 1 swxY w Y   |                    t          j                  rdS |                    t          j                  rdS |                    t          j	                  rdS |                    t          j
                  rdS |                    t          j                  rdS t                              d|          }|r'|                    d                                          S t                              d	|          }|r'|                    d                                          S dS )
Nrbz	utf-16-bez	utf-16-lez	utf-32-bez	utf-32-lezutf-8s!   \s*<\?xml\b.*\bencoding="([^"]+)"r   s!   \s*<\?xml\b.*\bencoding='([^']+)')r   PathPointerr   readlineclose
startswithcodecsBOM_UTF16_BEBOM_UTF16_LEBOM_UTF32_BEBOM_UTF32_LEBOM_UTF8r<   matchgroupr*   )r   r    infilesms        r   r@   zXMLCorpusView._detect_encoding   s   fk** 	&OO%%fd## &vOO%%& & & & & & & & & & & & & & &<<+,, 	;<<+,, 	;<<+,, 	;<<+,, 	;<<(( 	7HH:A>> 	'771::$$&&&HH:A>> 	'771::$$&&&ws   (A A*=BB"%B"c                     |S )a  
        Convert an element into an appropriate value for inclusion in
        the view.  Unless overridden by a subclass or by the
        ``elt_handler`` constructor argument, this method simply
        returns ``elt``.

        :return: The view value corresponding to ``elt``.

        :type elt: ElementTree
        :param elt: The element that should be converted.

        :type context: str
        :param context: A string composed of element tags separated by
            forward slashes, indicating the XML context of the given
            element.  For example, the string ``'foo/bar/baz'``
            indicates that the element is a ``baz`` element whose
            parent is a ``bar`` element and whose grandparent is a
            top-level ``foo`` element.
        r7   )r   r"   contexts      r   r;   zXMLCorpusView.handle_elt   s	    ( 
r   a;  
        [^<]*
        (
          ((<!--.*?-->)                         |  # comment
           (<![CDATA[.*?]])                     |  # raw character data
           (<!DOCTYPE\s+[^\[]*(\[[^\]]*])?\s*>) |  # doctype decl
           (<[^!>][^>]*>))                         # tag or PI
          [^<]*)*
        \Zz<\s*(?:/\s*)?([^\s>]+)a6  
        # Include these so we can skip them:
        (?P<COMMENT>        <!--.*?-->                          )|
        (?P<CDATA>          <![CDATA[.*?]]>                     )|
        (?P<PI>             <\?.*?\?>                           )|
        (?P<DOCTYPE>        <!DOCTYPE\s+[^\[^>]*(\[[^\]]*])?\s*>)|
        # These are the ones we actually care about:
        (?P<EMPTY_ELT_TAG>  <\s*[^>/\?!\s][^>]*/\s*>            )|
        (?P<START_TAG>      <\s*[^>/\?!\s][^>]*>                )|
        (?P<END_TAG>        <\s*/[^>/\?!\s][^>]*>               )c                 ~   d}t          |t                    r|                                }	 |                    | j                  }||z  }| j                            |          r|S t                              d|          	                    d          dk    rd|                                t          |          t                              d|                                          z
  z
  }t          d|z            |st          d          |                    d          }|dk    r| j                            |d	|                   rqt          |t                    r+|                    |           |                    |           n'|                    t          |          |z
   d
           |d	|         S )a{  
        Read a string from the given stream that does not contain any
        un-closed tags.  In particular, this function first reads a
        block from the stream of size ``self._BLOCK_SIZE``.  It then
        checks if that block contains an un-closed tag.  If it does,
        then this function either backtracks to the last '<', or reads
        another block.
         Tz[<>]r   >zUnexpected ">" near char %sz&Unexpected end of file: tag not closed<Nr   )r   r   tellread_BLOCK_SIZE_VALID_XML_RErP   r<   searchrQ   r   end
ValueErrorrfindseekchar_seek_forward)r   streamfragmentstartpos	xml_blockposlast_open_brackets          r   _read_xml_fragmentz XMLCorpusView._read_xml_fragment   s    f9:: 	%{{}}H	8D$455I	!H !''11   yy**0033s::kkmmMMBIIfh$?$?$C$C$E$EE !!>!DEEE  K !IJJJ
 !)s 3 3 1$$%++H5G6G5G,HII 8!&*EFF MH---001BCCCCc(mm6G&G$H!LLL#$6%6$677?	8r   c                 	   || j         }| j        t          | j                            |                                                    }|J g }d}d}d}|g k    s|t          |t                    r|                                }	|                     |          }
|
s|nt          d          | j
                            |
          D ]}| j        rPt          d                    d                    |          dd         |                                                     |                    d          r| j                            |                                                              d          }|                    |           |Qt(                              |d                    |                    r#|                                }t-          |          }|                    d	          r| j                            |                                                              d          }|st          d
|z            ||d         k    rt          d|d          d| d          |b|t-          |          k    rO||
||                                         z  }|                    |d                    |          f           dx}}d}|                                 "|                    d          r| j                            |                                                              d          }|vt(                              |d                    |          dz   |z             rB|                    |                                d                    |          dz   |z   f           ||g k    r||
|d         z  }d}n| j        rt          d           t          |t                    r+|                    |	           |                    |           n'|                    t-          |
          |z
   d           |d|dz
           }dx}}d}|g k    ||                                }|| j        v r!t7          |          | j        |         k    sJ nt7          |          | j        |<   fd|D             S )z
        Read from ``stream`` until we find at least one element that
        matches ``tagspec``, and return the result of applying
        ``elt_handler`` to each element found.
        NrX   zUnexpected end of filez	{:>25} {}/i	START_TAGr   END_TAGzUnmatched tag </%s>zUnmatched tag <z>...</rY   EMPTY_ELT_TAGr   z/                                    (backtrack)c           
      z    g | ]7\  }} t          j        |                    d d                    |          8S )asciixmlcharrefreplace)r   
fromstringencode).0r"   rV   rC   s      r   
<listcomp>z,XMLCorpusView.read_block.<locals>.<listcomp>  s[     
 
 

 g	 K&szz';N'O'OPP 
 
 
r   )r>   r;   listr?   getr[   r   r   rk   ra   
_XML_PIECEfinditer_DEBUGprintformatjoinrQ   _XML_TAG_NAMErP   appendr<   startr   r`   poprc   rd   tuple)r   re   rB   rC   rV   elts	elt_start	elt_depthelt_textrg   xml_fragmentpiecenameri   s      `          r   
read_blockzXMLCorpusView.read_block"  s    ?mG/K t(,,V[[]];;<<"""		bjjI1&"=>> )!;;==226::L   ?$$%=>>> 11,?? "Y "Y; V+,,SXXg->->stt-DekkmmTTUUU;;{++ Y-33EKKMMBBHHKKDNN4((( (88GSXXg->->?? 5(-I(+GI[[++ Y-33EKKMMBBHHKKD" G()>)EFFFwr{**()U72;)U)Ud)U)U)UVVV ,c'll1J1J LUYY[[1H$IIXsxx/@/@$ABBB044	I#%KKMMMM[[11 Y-33EKKMMBBHHKKD (88GSXXg->->-Dt-KLL Y KK8I8IC8ORV8V(WXXX$ 2::YZZ 88H !II { 86777!&*EFF IH---00;;;;c,&7&7)&C$DaHHH%o	Ao6G,00I	!HO bjjI1T kkmm$###>>T%6s%;;;;;;%*7^^Dc"
 
 
 

 #'
 
 
 	
r   r   )NN)r3   r4   r5   r6   r}   r]   r   r@   r;   r<   r=   DOTALLVERBOSEr^   r   r{   rk   r   r7   r   r   r9   r9   Q   s         < F K"I "I "I "IH  :  0 JJ	 		BJ M JJ899M 		E 		BJ J,8 ,8 ,8bk
 k
 k
 k
 k
 k
r   r9   )r6   rJ   	xml.etreer   nltk.corpus.reader.apir   nltk.corpus.reader.util	nltk.datar   nltk.internalsr   nltk.tokenizer   r
   rA   r9   r7   r   r   <module>r      s      ! ! ! ! ! ! / / / / / / % % % % 1 1 1 1 1 1 ) ) ) ) ) ) , , , , , ,6 6 6 6 6l 6 6 6r|
 |
 |
 |
 |
* |
 |
 |
 |
 |
r   