
    '[fs                         d Z ddlZddlmZ ddlT ddlT ddlT  G d d          Z G d de          Z	 G d	 d
e
          Zd ZdS )a  
Read from the Senseval 2 Corpus.

SENSEVAL [http://www.senseval.org/]
Evaluation exercises for Word Sense Disambiguation.
Organized by ACL-SIGLEX [https://www.siglex.org/]

Prepared by Ted Pedersen <tpederse@umn.edu>, University of Minnesota,
https://www.d.umn.edu/~tpederse/data.html
Distributed with permission.

The NLTK version of the Senseval 2 files uses well-formed XML.
Each instance of the ambiguous words "hard", "interest", "line", and "serve"
is tagged with a sense identifier, and supplied with context.
    N)ElementTree)*c                       e Zd Zd Zd ZdS )SensevalInstancec                 X    || _         t          |          | _        || _        || _        d S N)wordtuplesensespositioncontext)selfr	   r   r   r   s        O/var/www/piapp/venv/lib/python3.11/site-packages/nltk/corpus/reader/senseval.py__init__zSensevalInstance.__init__"   s(    	Fmm     c           	      H    d| j         d| j        d| j        d| j        d	S )NzSensevalInstance(word=z, position=z
, context=z	, senses=))r	   r   r   r   )r   s    r   __repr__zSensevalInstance.__repr__(   s2     IIIMMMLLLKKK	
 	
r   N)__name__
__module____qualname__r   r    r   r   r   r   !   s2          
 
 
 
 
r   r   c                       e Zd ZddZd ZdS )SensevalCorpusReaderNc                 \    t          d |                     |d          D                       S )Nc                 4    g | ]\  }}t          ||          S r   )SensevalCorpusView).0fileidencs      r   
<listcomp>z2SensevalCorpusReader.instances.<locals>.<listcomp>4   s6       !VS #63//  r   T)concatabspaths)r   fileidss     r   	instanceszSensevalCorpusReader.instances2   s>     %)]]7D%A%A  
 
 	
r   c                     g }|                     d          D ]V}|                     d          D ]>}|d         j        d         }d |d         D             }|                    ||f           ?W|S )Nlexeltinstancer   senseidc                 6    g | ]}|j         |j        d          fS )pos)textattrib)r   ws     r   r!   z/SensevalCorpusReader._entry.<locals>.<listcomp>?   s%    FFFAFAHUO4FFFr      )findallr-   append)r   treeeltsr'   instsenser   s          r   _entryzSensevalCorpusReader._entry:   s    ll8,, 	. 	.Fz22 . .Qy1FFd1gFFFUG,----. r   r   )r   r   r   r%   r6   r   r   r   r   r   1   s7        
 
 
 
    r   r   c                        e Zd Zd Zd Zd ZdS )r   c                     t                               | ||           t                      | _        dg| _        d g| _        d S )N)encodingr   )StreamBackedCorpusViewr   WhitespaceTokenizer_word_tokenizer_lexelt_starts_lexelts)r   r   r9   s      r   r   zSensevalCorpusView.__init__E   sB    ''fx'HHH244 cr   c                     t                               | j        |                                          dz
  }| j        |         }g }d}	 |                                }|dk    r
|g k    sJ g S |                                                    d          r|dz  }t          j	        d|          }|J |
                    d          dd         }|t          | j                  k     r|| j        |         k    sJ nF| j                            |           | j                            |                                           |                                                    d          r
|g k    sJ d}|r|                    |           |                                                    d	          rOd
                    |          }t          |          }t          j        |          }	|                     |	|          gS )Nr/   FT z<lexeltzitem=("[^"]+"|'[^']+')z	<instancez
</instance
)bisectbisect_rightr=   tellr>   readlinelstrip
startswithresearchgrouplenr1   join_fixXMLr   
fromstring_parse_instance)
r   stream
lexelt_numr'   instance_linesin_instancelinem	xml_blockr4   s
             r   
read_blockzSensevalCorpusView.read_blockL   s   (()<fkkmmLLqP
z* 	<??$$Drzz%++++	 {{}}''	22 	>a
I94@@}}}AbD)DM 2 222!T]:%>>>>>>M((000'..v{{}}=== {{}}''44 #%++++"  ,%%d+++ {{}}''55 < IIn55	#I..	"-i88,,T6::;;A 	<r   c                    g }g }d }|D ]]}|j         dk    r!|                    |j        d                    /|j         dk    r|| j                            |j                  z  }|D ]}|j         dk    r|d         }|j         dk    rA|
J d            |j                                        st          |          dk    sJ |j                                        rt          |          dk    rJ t          |          }|j                                        r-|                    |j                                                   n|d         j         d	k    ri|                    |d         j        |d         j        d
         f           |d         j        r(|| j                            |d         j                  z  }nhJ d            |j         d	k    r(|                    |j        |j        d
         f           n+|j         dk    rnt          d|j                    J d            |j        r"|| j                            |j                  z  }MJ d|j         z              t          ||||          S )Nanswerr)   r   compoundr   headzhead specified twicer/   wfr+   Fzexpected CDATA or wf in <head>sACKz expected CDATA or <wf> or <head>zunexpected tag %s)tagr1   r-   r<   tokenizer,   striprL   tailprintr   )r   r(   r'   r   r   r   childcwords           r   rP   z"SensevalCorpusView._parse_instanceu   s    $	> $	>EyH$$el956666i''4/88DDD" M MEyJ.. %ayF**'//1G///$z//11DSZZ1____$)J$4$4$6$6K3u::????#&w<< :++-- K#NN5:+;+;+=+=>>>>"1X\T11#NNE!HM58?5;Q+RSSS$Qx} X '4+?+H+Hq+W+W WJ*JJJJd**
EL4G'HIIIIc)) eUY///H&HHHHz M4#7#@#@#L#LL;M> >1EI===='6BBBr   N)r   r   r   r   rX   rP   r   r   r   r   r   D   sG          '< '< '<R)C )C )C )C )Cr   r   c                    t          j        dd|           } t          j        dd|           } t          j        dd|           } t          j        dd|           } t          j        d	d
|           } t          j        dd|           } t          j        dd|           } t          j        dd|           } t          j        dd|           } t          j        dd|           } t          j        dd|           } t          j        dd|           } t          j        dd|           } t          j        dd|           } t          j        dd|           } | S )z:
    Fix the various issues with Senseval pseudo-XML.
    z	<([~\^])>z\1z(\s+)\&(\s+)z	\1&amp;\2z"""z'"'z(<[^<]*snum=)([^">]+)>z\1"\2"/>z<\&frasl>\s*<p[^>]*>FRASLz
<\&I[^>]*>r@   z<{([^}]+)}>z	<(@|/?p)>z	<&\w+ \.>z<!DOCTYPE[^>]*>z<\[\/?[^>]+\]*>z
<(\&\w+;)>z&(?!amp|gt|lt|apos|quot)z'[ \t]*([^<>\s]+?)[ \t]*<p="([^"]*"?)"/>z <wf pos="\2">\1</wf>z\s*"\s*<p=\'"\'/>z <wf pos='"'>"</wf>)rI   sub)r,   s    r   rN   rN      s5   
 6,t,,D6/<66D6&&$''D6+[$??D6)7D99D6-T**D6.%..D6,T**D6,T**D6$c400D6$c400D6---D6-sD99D624Ld D 6&(?FFDKr   )__doc__rI   	xml.etreer   nltk.corpus.reader.apinltk.corpus.reader.utilnltk.tokenizer   CorpusReaderr   r:   r   rN   r   r   r   <module>rp      s      
			 ! ! ! ! ! ! $ $ $ $ % % % %    
 
 
 
 
 
 
 
     <   &ZC ZC ZC ZC ZC/ ZC ZC ZCz# # # # #r   