
    '[f!0                         d Z ddlZddlT ddlT  ej        d          Z ej        d          Z ej        d          Z ej        d          Z G d d	          Z	 G d
 d          Z
 G d de          ZdS )a	  
CorpusReader for reviews corpora (syntax based on Customer Review Corpus).

Customer Review Corpus information
==================================

Annotated by: Minqing Hu and Bing Liu, 2004.
    Department of Computer Science
    University of Illinois at Chicago

Contact: Bing Liu, liub@cs.uic.edu
        https://www.cs.uic.edu/~liub

Distributed with permission.

The "product_reviews_1" and "product_reviews_2" datasets respectively contain
annotated customer reviews of 5 and 9 products from amazon.com.

Related papers:

- Minqing Hu and Bing Liu. "Mining and summarizing customer reviews".
    Proceedings of the ACM SIGKDD International Conference on Knowledge
    Discovery & Data Mining (KDD-04), 2004.

- Minqing Hu and Bing Liu. "Mining Opinion Features in Customer Reviews".
    Proceedings of Nineteeth National Conference on Artificial Intelligence
    (AAAI-2004), 2004.

- Xiaowen Ding, Bing Liu and Philip S. Yu. "A Holistic Lexicon-Based Appraoch to
    Opinion Mining." Proceedings of First ACM International Conference on Web
    Search and Data Mining (WSDM-2008), Feb 11-12, 2008, Stanford University,
    Stanford, California, USA.

Symbols used in the annotated reviews:

    :[t]: the title of the review: Each [t] tag starts a review.
    :xxxx[+|-n]: xxxx is a product feature.
    :[+n]: Positive opinion, n is the opinion strength: 3 strongest, and 1 weakest.
           Note that the strength is quite subjective.
           You may want ignore it, but only considering + and -
    :[-n]: Negative opinion
    :##:   start of each sentence. Each line is a sentence.
    :[u]:  feature not appeared in the sentence.
    :[p]:  feature not appeared in the sentence. Pronoun resolution is needed.
    :[s]:  suggestion or recommendation.
    :[cc]: comparison with a competing product from a different brand.
    :[cs]: comparison with a competing product from the same brand.

Note: Some of the files (e.g. "ipod.txt", "Canon PowerShot SD500.txt") do not
    provide separation between different reviews. This is due to the fact that
    the dataset was specifically designed for aspect/feature-based sentiment
    analysis, for which sentence-level annotation is sufficient. For document-
    level classification and analysis, this peculiarity should be taken into
    consideration.
    N)*z^\[t\](.*)$z%((?:(?:\w+\s)+)?\w+)\[((?:\+|\-)\d)\]z\[(?!t)(p|u|s|cc|cs)\]z##(.*)$c                   2    e Zd ZdZddZd Zd Zd Zd ZdS )	Reviewz>
    A Review is the main block of a ReviewsCorpusReader.
    Nc                 8    || _         |	g | _        dS || _        dS )z
        :param title: the title of the review.
        :param review_lines: the list of the ReviewLines that belong to the Review.
        N)titlereview_lines)selfr   r   s      N/var/www/piapp/venv/lib/python3.11/site-packages/nltk/corpus/reader/reviews.py__init__zReview.__init__R   s-    
 
 "D ,D    c                 h    t          |t                    sJ | j                            |           dS )z
        Add a line (ReviewLine) to the review.

        :param review_line: a ReviewLine instance that belongs to the Review.
        N)
isinstance
ReviewLiner   append)r	   review_lines     r
   add_linezReview.add_line]   s7     +z22222  -----r   c                 R    g }| j         D ]}|                    |j                   |S )a  
        Return a list of features in the review. Each feature is a tuple made of
        the specific item feature and the opinion strength about that feature.

        :return: all features of the review as a list of tuples (feat, score).
        :rtype: list(tuple)
        )r   extendfeatures)r	   r   r   s      r
   r   zReview.featuresf   s8     , 	2 	2KOOK01111r   c                 $    d | j         D             S )z
        Return all tokenized sentences in the review.

        :return: all sentences of the review as lists of tokens.
        :rtype: list(list(str))
        c                     g | ]	}|j         
S  )sent).0r   s     r
   
<listcomp>z Review.sents.<locals>.<listcomp>z   s    FFF[ FFFr   )r   r	   s    r
   sentszReview.sentss   s     GFD4EFFFFr   c                 B    d                     | j        | j                  S )Nz#Review(title="{}", review_lines={}))formatr   r   r   s    r
   __repr__zReview.__repr__|   s$    4;;J)
 
 	
r   NN)	__name__
__module____qualname____doc__r   r   r   r   r    r   r   r
   r   r   M   sq         	- 	- 	- 	-. . .  G G G
 
 
 
 
r   r   c                        e Zd ZdZddZd ZdS )r   z
    A ReviewLine represents a sentence of the review, together with (optional)
    annotations of its features and notes about the reviewed item.
    Nc                 Z    || _         |g | _        n|| _        |	g | _        d S || _        d S Nr   r   notes)r	   r   r   r*   s       r
   r   zReviewLine.__init__   s:    	DMM$DM=DJJJDJJJr   c                 N    d                     | j        | j        | j                  S )Nz*ReviewLine(features={}, notes={}, sent={}))r   r   r*   r   r   s    r
   r    zReviewLine.__repr__   s'    ;BBM4:ty
 
 	
r   r!   )r"   r#   r$   r%   r   r    r   r   r
   r   r      sA         

 
 
 

 
 
 
 
r   r   c                   j    e Zd ZdZeZ e            dfdZddZddZ	ddZ
ddZd	 Zd
 Zd Zd ZdS )ReviewsCorpusReadera  
    Reader for the Customer Review Data dataset by Hu, Liu (2004).
    Note: we are not applying any sentence tokenization at the moment, just word
    tokenization.

        >>> from nltk.corpus import product_reviews_1
        >>> camera_reviews = product_reviews_1.reviews('Canon_G3.txt')
        >>> review = camera_reviews[0]
        >>> review.sents()[0] # doctest: +NORMALIZE_WHITESPACE
        ['i', 'recently', 'purchased', 'the', 'canon', 'powershot', 'g3', 'and', 'am',
        'extremely', 'satisfied', 'with', 'the', 'purchase', '.']
        >>> review.features() # doctest: +NORMALIZE_WHITESPACE
        [('canon powershot g3', '+3'), ('use', '+2'), ('picture', '+2'),
        ('picture quality', '+1'), ('picture quality', '+1'), ('camera', '+2'),
        ('use', '+2'), ('feature', '+1'), ('picture quality', '+3'), ('use', '+1'),
        ('option', '+1')]

    We can also reach the same information directly from the stream:

        >>> product_reviews_1.features('Canon_G3.txt')
        [('canon powershot g3', '+3'), ('use', '+2'), ...]

    We can compute stats for specific product features:

        >>> n_reviews = len([(feat,score) for (feat,score) in product_reviews_1.features('Canon_G3.txt') if feat=='picture'])
        >>> tot = sum([int(score) for (feat,score) in product_reviews_1.features('Canon_G3.txt') if feat=='picture'])
        >>> mean = tot / n_reviews
        >>> print(n_reviews, tot, mean)
        15 24 1.6
    utf8c                 \    t                               | |||           || _        d| _        dS )ad  
        :param root: The root directory for the corpus.
        :param fileids: a list or regexp specifying the fileids in the corpus.
        :param word_tokenizer: a tokenizer for breaking sentences or paragraphs
            into words. Default: `WordPunctTokenizer`
        :param encoding: the encoding that should be used to read the corpus.
        z
README.txtN)CorpusReaderr   _word_tokenizer_readme)r	   rootfileidsword_tokenizerencodings        r
   r   zReviewsCorpusReader.__init__   s1     	dD'8<<<-#r   Nc                      | j         }nt          |t                    r|g}t           fd                     |d          D                       S )au  
        Return a list of features. Each feature is a tuple made of the specific
        item feature and the opinion strength about that feature.

        :param fileids: a list or regexp specifying the ids of the files whose
            features have to be returned.
        :return: all features for the item(s) in the given file(s).
        :rtype: list(tuple)
        Nc                 P    g | ]"\  }}                     |j        |           #S )r6   )
CorpusView_read_featuresr   fileidencr	   s      r
   r   z0ReviewsCorpusReader.features.<locals>.<listcomp>   sB       !VS (;cJJ  r   T)_fileidsr   strconcatabspathsr	   r4   s   ` r
   r   zReviewsCorpusReader.features   sr     ?mGG%% 	 iG   %)]]7D%A%A  
 
 	
r   c                 t     | j         }t           fd                     |d          D                       S )aS  
        Return all the reviews as a list of Review objects. If `fileids` is
        specified, return all the reviews from each of the specified files.

        :param fileids: a list or regexp specifying the ids of the files whose
            reviews have to be returned.
        :return: the given file(s) as a list of reviews.
        Nc                 P    g | ]"\  }}                     |j        |           #S r9   )r:   _read_review_blockr<   s      r
   r   z/ReviewsCorpusReader.reviews.<locals>.<listcomp>   sB       !VS (?#NN  r   T)r?   rA   rB   rC   s   ` r
   reviewszReviewsCorpusReader.reviews   sW     ?mG   %)]]7D%A%A  
 
 	
r   c                 d     t           fd                     |dd          D                       S )aY  
        Return all sentences in the corpus or in the specified files.

        :param fileids: a list or regexp specifying the ids of the files whose
            sentences have to be returned.
        :return: the given file(s) as a list of sentences, each encoded as a
            list of word strings.
        :rtype: list(list(str))
        c                 R    g | ]#\  }}}                     |j        |           $S r9   )r:   _read_sent_blockr   pathr>   r=   r	   s       r
   r   z-ReviewsCorpusReader.sents.<locals>.<listcomp>   D       'T3 d&;cJJ  r   TrA   rB   rC   s   ` r
   r   zReviewsCorpusReader.sents   M        +/==$+M+M  
 
 	
r   c                 d     t           fd                     |dd          D                       S )aK  
        Return all words and punctuation symbols in the corpus or in the specified
        files.

        :param fileids: a list or regexp specifying the ids of the files whose
            words have to be returned.
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        c                 R    g | ]#\  }}}                     |j        |           $S r9   )r:   _read_word_blockrK   s       r
   r   z-ReviewsCorpusReader.words.<locals>.<listcomp>  rM   r   TrN   rC   s   ` r
   wordszReviewsCorpusReader.words  rO   r   c                     g }t          d          D ]I}|                                }|s|c S |                    t          j        t
          |                     J|S )N   )rangereadliner   refindallFEATURES)r	   streamr   ilines        r
   r;   z"ReviewsCorpusReader._read_features  sb    r 	8 	8A??$$D  OOBJx667777r   c                    	 |                                 }|sg S t          j        t          |          }|r6t	          |                    d                                                    }nk	 |                                }|                                 }|s|gS t          j        t          |          r|                    |           |gS t          j	        t          |          }t          j	        t          |          }t          j	        t          |          }|r | j                            |d                   }t          |||          }	|                    |	           )NT   )r   r   r)   )rW   rX   matchTITLEr   groupstriptellseekrY   rZ   NOTESSENTr1   tokenizer   r   )
r	   r[   r]   title_matchreviewoldposfeatsr*   r   r   s
             r
   rF   z&ReviewsCorpusReader._read_review_block  sV   		??$$D 	(5$//K %++A..4466   			)[[]]F??$$D  x xt$$  F###xJx..EJud++E:dD))D >+44T!W==$$eLLLKOOK(((%	)r   c                     g }|                      |          D ]3}|                    d |                                D                        4|S )Nc                     g | ]}|S r   r   )r   r   s     r
   r   z8ReviewsCorpusReader._read_sent_block.<locals>.<listcomp>A  s    :::4$:::r   )rF   r   r   )r	   r[   r   rj   s       r
   rJ   z$ReviewsCorpusReader._read_sent_block>  sU    --f55 	< 	<FLL::6<<>>:::;;;;r   c                     g }t          d          D ]e}|                                }t          j        t          |          }|r3|                    | j                            |d                              f|S )NrU   r   )rV   rW   rX   rY   rg   r   r1   rh   )r	   r[   rS   r\   r]   r   s         r
   rR   z$ReviewsCorpusReader._read_word_blockD  st    r 	E 	EA??$$D:dD))D ET1::47CCDDDr   r(   )r"   r#   r$   r%   StreamBackedCorpusViewr:   WordPunctTokenizerr   r   rG   r   rS   r;   rF   rJ   rR   r   r   r
   r-   r-      s         > (J -?,>,@,@6$ $ $ $
 
 
 
*
 
 
 
$
 
 
 
"
 
 
 
"  ) ) )B      r   r-   )r%   rX   nltk.corpus.reader.apinltk.tokenizecompilera   rZ   rf   rg   r   r   r0   r-   r   r   r
   <module>ru      s  6 6p 
			 $ $ $ $    
>""2:,  	
,--rz*2
 2
 2
 2
 2
 2
 2
 2
j
 
 
 
 
 
 
 
0q q q q q, q q q q qr   