
    '[f                      b    d Z ddlZddlZddlmZ ddlmZmZmZ ddl	m
Z
  G d de          ZdS )z{
A reader for corpora that consist of Tweets. It is assumed that the Tweets
have been serialised into line-delimited JSON.
    N)CorpusReader)StreamBackedCorpusViewZipFilePathPointerconcat)TweetTokenizerc                   T    e Zd ZdZeZ	 d e            dfdZd	dZd	dZ	d	dZ
d ZdS )
TwitterCorpusReadera7  
    Reader for corpora that consist of Tweets represented as a list of line-delimited JSON.

    Individual Tweets can be tokenized using the default tokenizer, or by a
    custom tokenizer specified as a parameter to the constructor.

    Construct a new Tweet corpus reader for a set of documents
    located at the given root directory.

    If you made your own tweet collection in a directory called
    `twitter-files`, then you can initialise the reader as::

        from nltk.corpus import TwitterCorpusReader
        reader = TwitterCorpusReader(root='/path/to/twitter-files', '.*\.json')

    However, the recommended approach is to set the relevant directory as the
    value of the environmental variable `TWITTER`, and then invoke the reader
    as follows::

       root = os.environ['TWITTER']
       reader = TwitterCorpusReader(root, '.*\.json')

    If you want to work directly with the raw Tweets, the `json` library can
    be used::

       import json
       for tweet in reader.docs():
           print(json.dumps(tweet, indent=1, sort_keys=True))

    Nutf8c                    t          j        | |||           |                     | j                  D ]N}t	          |t
                    rt          j                            |          dk    rt          d| d          O	 || _
        dS )a  
        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param word_tokenizer: Tokenizer for breaking the text of Tweets into
            smaller units, including but not limited to words.
        r   zFile z	 is emptyN)r   __init__abspaths_fileids
isinstancer   ospathgetsize
ValueError_word_tokenizer)selfrootfileidsword_tokenizerencodingr   s         N/var/www/piapp/venv/lib/python3.11/site-packages/nltk/corpus/reader/twitter.pyr   zTwitterCorpusReader.__init__:   s     	dD'8<<<MM$-00 	: 	:D$ 233 :&&!++ !8!8!8!8999 ,E-    c                 d     t           fd                     |dd          D                       S )a(  
        Returns the full Tweet objects, as specified by `Twitter
        documentation on Tweets
        <https://dev.twitter.com/docs/platform-objects/tweets>`_

        :return: the given file(s) as a list of dictionaries deserialised
            from JSON.
        :rtype: list(dict)
        c                 R    g | ]#\  }}}                     |j        |           $S ))r   )
CorpusView_read_tweets).0r   encfileidr   s       r   
<listcomp>z,TwitterCorpusReader.docs.<locals>.<listcomp>Y   sD       'T3 d&7#FF  r   T)r   r   )r   r   s   ` r   docszTwitterCorpusReader.docsN   sM        +/==$+M+M  
 
 	
r   c                     |                      |          }g }|D ]_}	 |d         }t          |t                    r|                    | j                  }|                    |           P# t          $ r Y \w xY w|S )z
        Returns only the text content of Tweets in the file(s)

        :return: the given file(s) as a list of Tweets.
        :rtype: list(str)
        text)r$   r   bytesdecoder   appendKeyError)r   r   
fulltweetstweetsjsonor&   s         r   stringszTwitterCorpusReader.strings_   s     YYw''
 	 	EV}dE** 6;;t}55Dd####   s   AA**
A76A7c                 X    |                      |          }| j        fd|D             S )z
        :return: the given file(s) as a list of the text content of Tweets as
            as a list of words, screenanames, hashtags, URLs and punctuation symbols.

        :rtype: list(list(str))
        c                 :    g | ]}                     |          S  )tokenize)r    t	tokenizers     r   r#   z1TwitterCorpusReader.tokenized.<locals>.<listcomp>{   s'    666!	""1%%666r   )r.   r   )r   r   r,   r4   s      @r   	tokenizedzTwitterCorpusReader.tokenizedr   s8     g&&(	6666v6666r   c                     g }t          d          D ]E}|                                }|s|c S t          j        |          }|                    |           F|S )zS
        Assumes that each line in ``stream`` is a JSON-serialised object.
        
   )rangereadlinejsonloadsr)   )r   streamr,   ilinetweets         r   r   z TwitterCorpusReader._read_tweets}   sg     r 	! 	!A??$$D Jt$$EMM%    r   )N)__name__
__module____qualname____doc__r   r   r   r   r$   r.   r5   r   r1   r   r   r	   r	      s         > (J
 !1A1AF. . . .(
 
 
 
"   &	7 	7 	7 	7    r   r	   )rC   r:   r   nltk.corpus.reader.apir   nltk.corpus.reader.utilr   r   r   nltk.tokenizer   r	   r1   r   r   <module>rG      s    
  				 / / / / / / V V V V V V V V V V ( ( ( ( ( (s s s s s, s s s s sr   