
    zIf+                     N   d Z ddlZddlZddlZddlmZ ddlmZm	Z	 	 ddl
mZ n# e$ r Y nw xY wddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ  G d de          Z G d de          Zd Zd Zd ZddZd Zd ZddZe dk    rddl!mZ  ed            ed           dS dS )z
Named entity chunker
    N)ElementTree)ClassifierBasedTaggerpos_tag)MaxentClassifier)ChunkParserI)
ChunkScore)find)word_tokenize)Treec                   *    e Zd ZdZd Zd Zd Zd ZdS )NEChunkParserTaggerz2
    The IOB tagger used by the chunk parser.
    c                 >    t          j        | || j                   d S )N)trainclassifier_builder)r   __init___classifier_builderselfr   s     K/var/www/piapp/venv/lib/python3.11/site-packages/nltk/chunk/named_entity.pyr   zNEChunkParserTagger.__init__$   s0    &$2J	
 	
 	
 	
 	
 	
    c                 2    t          j        |ddd          S )Nmegam      )	algorithmgaussian_prior_sigmatrace)r   r   r   s     r   r   z'NEChunkParserTagger._classifier_builder)   s%    %W1A
 
 
 	
r   c                     	 | j         }nD# t          $ r7 ddlm} t	          |                    d                    | _         | j         }Y nw xY w|S )Nr   )wordszen-basic)_en_wordlistAttributeErrornltk.corpusr   set)r   wlr   s      r   _english_wordlistz%NEChunkParserTagger._english_wordlist.   sm    	#"BB 	# 	# 	#)))))) #EKK
$;$; < <D"BBB		#
 	s   
 >A
Ac                    ||         d         }t          ||         d                   }|dk    rd x}}d x}}	d x}
x}}n|dk    r[||dz
           d                                         }d }t          ||dz
           d                   }d }	||dz
           d         }d x}
}n||dz
           d                                         }||dz
           d                                         }t          ||dz
           d                   }t          ||dz
           d                   }	||dz
           }||dz
           }t          |          }
|t          |          dz
  k    r	d x}}d x}}n|t          |          dz
  k    rK||dz            d                                         }||dz            d                                         }d }d }n||dz            d                                         }||dz            d                                         }||dz            d                                         }||dz            d                                         }i dddt          |          dt          |          d|d d	                                         d
|dd                                          d|d|d||                                 v d|d|d|d|d|d|                                 d| d| d| d|
 d| }|S )Nr   r   r   biasTshapewordlenprefix3   suffix3poswordzen-wordlistprevtagprevposnextposprevwordnextwordzword+nextpos+zpos+prevtagzshape+prevtag)simplify_poslowerr(   lenr%   )r   tokensindexhistoryr/   r.   r3   prevprevwordr1   prevprevpos	prevshaper0   prevprevtagr4   nextnextwordr2   nextnextposfeaturess                     r   _feature_detectorz%NEChunkParserTagger._feature_detector8   s   e}Q6%=+,,A::&**H|$((Gk044I4++aZZeai(+1133HL"6%!)#4Q#788GKeai(+G&**Ieai(+1133H!%!),Q/5577L"6%!)#4Q#788G&veai'8';<<Keai(G!%!),KhICKK!O##&**H|$((Gkkc&kkAo%%eai(+1133HUQY'*0022GLKKeai(+1133HUQY'*0022G!%!),Q/5577L +A.4466K
D
U4[[
 s4yy
 tBQBx~~''	

 tBCCy((
 3
 D
 DD$:$:$<$<<
 w
 w
 w
 
 
 tzz||77g77
 c--G--
  	55G55!
& r   N)__name__
__module____qualname____doc__r   r   r%   rC    r   r   r   r      sZ         
 
 


 
 

  8 8 8 8 8r   r   c                   @    e Zd ZdZd Zd Zd Zd Zed             Z	dS )NEChunkParserz2
    Expected input: list of pos-tagged words
    c                 0    |                      |           d S N)_trainr   s     r   r   zNEChunkParser.__init__x   s    Er   c                 d    | j                             |          }|                     |          }|S )z8
        Each token should be a pos-tagged word
        )_taggertag_tagged_to_parse)r   r9   taggedtrees       r   parsezNEChunkParser.parse{   s1     !!&))$$V,,r   c                 N      fd|D             }t          |           _        d S )Nc                 :    g | ]}                     |          S rH   )_parse_to_tagged).0sr   s     r   
<listcomp>z(NEChunkParser._train.<locals>.<listcomp>   s'    ;;;q$''**;;;r   )r   )r   rO   )r   corpuss   ` r   rM   zNEChunkParser._train   s2    ;;;;F;;;*888r   c                 6   t          dg           }|D ]\  }}|dk    r|                    |           "|                    d          r-|                    t          |dd         |g                     d|                    d          r|r]t          |d         t                     rB|d                                         |dd         k    r|d                             |           |                    t          |dd         |g                     |S )zH
        Convert a list of tagged tokens to a chunk-parse tree.
        SOB-r   NI-)r   append
startswith
isinstancelabel)r   tagged_tokenssenttokrP   s        r   rQ   zNEChunkParser._tagged_to_parse   s    C}}' 		6 		6JS#czzC    %% 6DQRR3%001111%% 6 6JtBx66 648>>;K;KsSTSUSUw;V;VHOOC((((KKSWse 4 4555r   c                    g }| D ]}t          |t                    rt          |          dk    rt          d           :|                    |d         d|                                 f           |dd         D ].}|                    |d|                                 f           /|                    |df           |S )zH
        Convert a chunk-parse tree to a list of tagged tokens.
        r   z"Warning -- empty chunk in sentencer_   r   Nr`   r^   )rd   r   r8   printrb   re   )rg   tokschildrh   s       r   rW   zNEChunkParser._parse_to_tagged   s    
  		* 		*E%&& *u::??>???U1X';EKKMM';';<=== 9 = =CKK&:5;;==&:&: ;<<<<= UCL))))r   N)
rD   rE   rF   rG   r   rT   rM   rQ   staticmethodrW   rH   r   r   rJ   rJ   s   su             9 9 9  $   \  r   rJ   c                 *   t          j        d| t           j                  rdS t          j        d| t           j                  rdS t          j        d| t           j                  r.|                                 rdS |                                 rdS dS d	S )
Nz![0-9]+(\.[0-9]*)?|[0-9]*\.[0-9]+$numberz\W+$punctz\w+$upcasedowncase	mixedcaseother)rematchUNICODEistitleislower)r/   s    r   r(   r(      s    	x4dBJGG x	'4	,	, 
w	'4	,	, <<>> 	8\\^^ 	:;wr   c                 f    |                      d          rdS |                     d          d         S )NV-r   )rc   split)rY   s    r   r6   r6      s/    ||C swws||Ar   c                    |                                  }d t          |          D             }t          dg           }| D ]}t          |t                    re|                    t          |                                g                      |D ],}|d                             |t          |          f           -||                    |t          |          f           |S )Nc              3       K   | ]	\  }}|V  
d S rL   rH   )rX   r/   r.   s      r   	<genexpr>zpostag_tree.<locals>.<genexpr>   s&      66s666666r   r]   ra   )leavesr   r   rd   rb   re   next)rS   r   tag_iternewtreerl   subchilds         r   postag_treer      s    KKMME66wu~~666H3mmG 4 4eT"" 	4NN4r22333! ? ?""Hd8nn#=>>>>? NNE4>>23333Nr   binaryTc           	   #     K   | D ]}t          j        |          D ]l\  }}}|                    d          r|r|D ]K}|                    d          r4t          t           j                            ||          |          E d {V  Lmd S )Nbnewsz.sgm)oswalkendswithload_ace_filepathjoin)rootsfmt
skip_bnewsrootdirsfilesfs          r   load_ace_datar      s       I I!# 	I 	ID$}}W%% *  I I::f%% I,RW\\$-B-BCHHHHHHHHHI	II Ir   c           	   #     K   t          dt          j                            |           d                     | dz   }g }t	          |          5 }t          j        |                                          }d d d            n# 1 swxY w Y   |                    d          D ]}|	                    d          j
        }|                    d          D ]}|                    d          dk    rt          |	                    d	          j
                  }	t          |	                    d
          j
                  dz   }
|                    |	|
|f           t	          |           5 }|                                }d d d            n# 1 swxY w Y   t          j        dd|          }d }t          j        d||          }t          j        dd|          }t          j        dd|          }t          j        dd|          }d |D             }|dk    rd}t#          dg           }t%          |          D ]~\  }	}
}|	|k     r|}	|
|	k    r|                    t)          |||	                              |                    t#          d||	|
                                                              |
}|                    t)          ||d                               |V  d S |dk    rd}t#          dg           }t%          |          D ]~\  }	}
}|	|k     r|}	|
|	k    r|                    t)          |||	                              |                    t#          |||	|
                                                              |
}|                    t)          ||d                               |V  d S t+          d          )Nz  - r   z.tmx.rdc.xmlzdocument/entityentity_typeentity_mentionTYPENAMEzhead/charseq/startzhead/charseq/endz<(?!/?TEXT)[^>]+> c                 `    d|                                  |                                 z
  dz
  z  S )N    )endstart)ms    r   subfunczload_ace_file.<locals>.subfunc   s'    aeegg		)A-..r   z[\s\S]*<TEXT>z</TEXT>[\s\S]*z``z "z''z" c                     h | ]\  }}}|	S rH   rH   )rX   rY   etyps       r   	<setcomp>z load_ace_file.<locals>.<setcomp>  s    444KQ3C444r   r   r   r]   NE
multiclasszbad fmt value)rj   r   r   r}   openETrT   getrootfindallr	   textgetintrb   readru   subr   sortedextendr
   
ValueError)textfiler   annfileentitiesinfilexmlentityr   mentionrY   r   r   r   entity_typesirk   s                   r   r   r      s<     	
-x((+
-
-...'G H	g )&hv&&(() ) ) ) ) ) ) ) ) ) ) ) ) ) )++/00 ) )kk-((-~~&677 	) 	)G{{6""f,,GLL!566;<<AGLL!3449::Q>AOOQ3K((((	) 
h 6{{}}               6%r400D/ / / 6"GT22D6#R..D 6$d##D6$d##D448444L hC}}!(++ 	 	KQ31uuAvvKKd1Q3i00111KKT4!9??#4#455666AAM$qrr(++,,,




 
		C}}!(++ 	 	KQ31uuAvvKKd1Q3i00111KKS$qs)//"3"344555AAM$qrr(++,,,




 )))s$   'BBB(F		FFc           	         t                               |           } t                               |          }d}t          | |          D ]z\  \  }}\  }}||cxk    rdk    rFn nC|s@t          d|dd|dd|            t          d                    ddd                     d}^d}t          d|dd|dd|            {d S )	NFr^   z  15r   z  {:15} {:15} {2}...T)rJ   rW   ziprj   format)correctguessedellipsiswctgts         r   
cmp_chunksr   '  s   ,,W55G,,W55GH11 , ,B!R????s?????  .2...2...1..///)00uEEFFFH*r***r***q**++++, ,r   c                 Z   t          d           t          d          t          d          t          d          t          d          g}t          ||           }d |D             }t          d           t          |          }~t          d           t          d	          g}t          ||           }d
 |D             }t          d           t	                      }t          |          D ]X\  }	}
|                    |
                                          }|                    |
|           |	dk     rt          |
|           Yt          |           d|  d}t          d| d           t          |d          5 }t          j        ||d           d d d            n# 1 swxY w Y   |S )NzLoading training data...zcorpora/ace_data/ace.devzcorpora/ace_data/ace.heldoutzcorpora/ace_data/bbn.devzcorpora/ace_data/muc.devc                 ,    g | ]}t          |          S rH   r   rX   ts     r   rZ   zbuild_model.<locals>.<listcomp>?  s    666Q+a..666r   zTraining...zLoading eval data...zcorpora/ace_data/ace.evalc                 ,    g | ]}t          |          S rH   r   r   s     r   rZ   zbuild_model.<locals>.<listcomp>G  s    444AQ444r   zEvaluating...r+   z/tmp/ne_chunker_z.picklezSaving chunker to r   wbra   )rj   r	   r   rJ   r   	enumeraterT   r   scorer   r   pickledump)r   train_pathstrain_trees
train_datacp
eval_paths
eval_trees	eval_data
chunkscorer   r   guessoutfilenameoutfiles                 r   build_modelr   6  s   	
$%%%'((+,,'(('((	K  S11K66+666J	-	z	"	"B	
 !!!2334Jz3//J44444I	/J	** ' '
7))**%(((q55w&&&	*1S111K	
/{
/
/
/000	k4	 	  %GB$$$% % % % % % % % % % % % % % % Is   =F  F$'F$__main__)r   r   )r   T)r   )"rG   r   r   ru   	xml.etreer   r   nltk.tagr   r   nltk.classifyr   ImportErrornltk.chunk.apir   nltk.chunk.utilr   	nltk.datar	   nltk.tokenizer
   	nltk.treer   r   rJ   r(   r6   r   r   r   r   r   rD   nltk.chunk.named_entityrH   r   r   <module>r      s    
			  				 ' ' ' ' ' ' 3 3 3 3 3 3 3 3	....... 	 	 	D	 ( ' ' ' ' ' & & & & & &       ' ' ' ' ' '      Q Q Q Q Q/ Q Q Qh8 8 8 8 8L 8 8 8v       I I I ID* D* D*R, , ," " " "J z333333KK s   % --