
r"""
NLTK Tokenizer Package

Tokenizers divide strings into lists of substrings.  For example,
tokenizers can be used to find the words and punctuation in a string:

    >>> from nltk.tokenize import word_tokenize
    >>> s = '''Good muffins cost $3.88\nin New York.  Please buy me
    ... two of them.\n\nThanks.'''
    >>> word_tokenize(s) # doctest: +NORMALIZE_WHITESPACE
    ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.',
    'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']

This particular tokenizer requires the Punkt sentence tokenization
models to be installed. NLTK also provides a simpler,
regular-expression based tokenizer, which splits text on whitespace
and punctuation:

    >>> from nltk.tokenize import wordpunct_tokenize
    >>> wordpunct_tokenize(s) # doctest: +NORMALIZE_WHITESPACE
    ['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', '.',
    'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']

We can also operate at the level of sentences, using the sentence
tokenizer directly as follows:

    >>> from nltk.tokenize import sent_tokenize, word_tokenize
    >>> sent_tokenize(s)
    ['Good muffins cost $3.88\nin New York.', 'Please buy me\ntwo of them.', 'Thanks.']
    >>> [word_tokenize(t) for t in sent_tokenize(s)] # doctest: +NORMALIZE_WHITESPACE
    [['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.'],
    ['Please', 'buy', 'me', 'two', 'of', 'them', '.'], ['Thanks', '.']]

Caution: when tokenizing a Unicode string, make sure you are not
using an encoded version of the string (it may be necessary to
decode it first, e.g. with ``s.decode("utf8")``).
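
For example (an illustrative sketch; the expected tokens mirror the doctest above):

    >>> word_tokenize(b"Good muffins cost $3.88".decode("utf8"))
    ['Good', 'muffins', 'cost', '$', '3.88']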

NLTK tokenizers can produce token-spans, represented as tuples of integers
having the same semantics as string slices, to support efficient comparison
of tokenizers.  (These methods are implemented as generators.)

    >>> from nltk.tokenize import WhitespaceTokenizer
    >>> list(WhitespaceTokenizer().span_tokenize(s)) # doctest: +NORMALIZE_WHITESPACE
    [(0, 4), (5, 12), (13, 17), (18, 23), (24, 26), (27, 30), (31, 36), (38, 44),
    (45, 48), (49, 51), (52, 55), (56, 58), (59, 64), (66, 73)]
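
Because the spans have the same semantics as string slices, each span can be
mapped back to its substring with ordinary slicing (derived from the spans
shown above):

    >>> [s[start:end] for start, end in WhitespaceTokenizer().span_tokenize(s)] # doctest: +NORMALIZE_WHITESPACE
    ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.',
    'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks.']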

There are numerous ways to tokenize text.  If you need more control over
tokenization, see the other methods provided in this package.
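
For instance, a custom :class:`.RegexpTokenizer` pattern (an illustrative
sketch, not the package's recommended tokenizer) can keep the currency amount
together as a single token:

    >>> from nltk.tokenize import RegexpTokenizer
    >>> RegexpTokenizer(r'\w+|\$[\d\.]+|\S+').tokenize(s) # doctest: +NORMALIZE_WHITESPACE
    ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York', '.',
    'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']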

For further information, please see Chapter 3 of the NLTK book.
    N)load)TweetTokenizercasual_tokenize)NLTKWordTokenizer)LegalitySyllableTokenizer)MWETokenizer)PunktSentenceTokenizer)BlanklineTokenizerRegexpTokenizerWhitespaceTokenizerWordPunctTokenizerblankline_tokenizeregexp_tokenizewordpunct_tokenize)ReppTokenizer)SExprTokenizersexpr_tokenize)LineTokenizerSpaceTokenizerTabTokenizerline_tokenize)SyllableTokenizer)StanfordSegmenter)TextTilingTokenizer)ToktokTokenizer)TreebankWordDetokenizerTreebankWordTokenizer)regexp_span_tokenizestring_span_tokenizeenglishc                 R    t          d| d          }|                    |           S )a  
    Return a sentence-tokenized copy of *text*,
    using NLTK's recommended sentence tokenizer
    (currently :class:`.PunktSentenceTokenizer`
    for the specified language).

    :param text: text to split into sentences
    :param language: the model name in the Punkt corpus
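
    A short usage sketch (the expected output mirrors the package-level doctests):

    >>> sent_tokenize("Good muffins cost $3.88 in New York.  Please buy me two of them.  Thanks.")
    ['Good muffins cost $3.88 in New York.', 'Please buy me two of them.', 'Thanks.']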
    ztokenizers/punkt/z.pickle)r   tokenize)textlanguage	tokenizers      J/var/www/piapp/venv/lib/python3.11/site-packages/nltk/tokenize/__init__.pysent_tokenizer'   `   s1     ::::;;Id###    Fc                 D    |r| gnt          | |          }d |D             S )a  
    Return a tokenized copy of *text*,
    using NLTK's recommended word tokenizer
    (currently an improved :class:`.TreebankWordTokenizer`
    along with :class:`.PunktSentenceTokenizer`
    for the specified language).

    :param text: text to split into words
    :type text: str
    :param language: the model name in the Punkt corpus
    :type language: str
    :param preserve_line: A flag to decide whether to sentence tokenize the text or not.
    :type preserve_line: bool
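
    A minimal sketch of the flag's effect (assuming the default Punkt and
    Treebank-style handling, where only a line-final period is split off):

    >>> word_tokenize("Hello world. Bye.")
    ['Hello', 'world', '.', 'Bye', '.']
    >>> word_tokenize("Hello world. Bye.", preserve_line=True)
    ['Hello', 'world.', 'Bye', '.']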
    """
    sentences = [text] if preserve_line else sent_tokenize(text, language)
    return [
        token
        for sent in sentences
        for token in _treebank_word_tokenizer.tokenize(sent)
    ]