RezaGooner
diff --git a/‎PerSent.egg-info/PKG-INFO
Lines changed: 121 additions & 0 deletions b/‎PerSent.egg-info/PKG-INFO
Lines changed: 121 additions & 0 deletions
diff --git a/‎PerSent.egg-info/SOURCES.txt
Lines changed: 11 additions & 0 deletions b/‎PerSent.egg-info/SOURCES.txt
Lines changed: 11 additions & 0 deletions
diff --git a/‎PerSent.egg-info/dependency_links.txt
Lines changed: 1 addition & 0 deletions b/‎PerSent.egg-info/dependency_links.txt
Lines changed: 1 addition & 0 deletions
diff --git a/‎PerSent.egg-info/requires.txt
Lines changed: 6 additions & 0 deletions b/‎PerSent.egg-info/requires.txt
Lines changed: 6 additions & 0 deletions
diff --git a/‎PerSent.egg-info/top_level.txt
Lines changed: 1 addition & 0 deletions b/‎PerSent.egg-info/top_level.txt
Lines changed: 1 addition & 0 deletions
diff --git a/‎PerSent/CommentAnalyzer.py
Lines changed: 199 additions & 0 deletions b/‎PerSent/CommentAnalyzer.py
Lines changed: 199 additions & 0 deletions
diff --git a/‎PerSent/__init__.py
Lines changed: 4 additions & 0 deletions b/‎PerSent/__init__.py
Lines changed: 4 additions & 0 deletions
diff --git a/‎PerSent/model/classifier.joblib
3.22 KB b/‎PerSent/model/classifier.joblib
3.22 KB
diff --git a/‎PerSent/model/word2vec.model
28.8 MB b/‎PerSent/model/word2vec.model
28.8 MB
@@ -0,0 +1,121 @@
+Metadata-Version: 2.4
+Name: PerSent
+Version: 1.0.2
+Summary: Persian Sentiment Analysis Toolkit
+Home-page: https://github.com/RezaGooner/PerSent
+Author: RezaGooner
+Author-email: RezaAsadiProgrammer@Gmail.com
+Keywords: persian sentiment analysis nlp
+Classifier: Programming Language :: Python :: 3
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: OS Independent
+Requires-Python: >=3.8
+Description-Content-Type: text/markdown
+Requires-Dist: hazm>=0.7.0
+Requires-Dist: gensim>=4.0.0
+Requires-Dist: scikit-learn>=1.0.0
+Requires-Dist: pandas>=1.3.0
+Requires-Dist: tqdm>=4.62.0
+Requires-Dist: joblib>=1.1.0
+Dynamic: author
+Dynamic: author-email
+Dynamic: classifier
+Dynamic: description
+Dynamic: description-content-type
+Dynamic: home-page
+Dynamic: keywords
+Dynamic: requires-dist
+Dynamic: requires-python
+Dynamic: summary
+
+# PerSent - Persian Sentiment Analyzer
+[![فارسی](https://img.shields.io/badge/Persian-فارسی-blue.svg)](README.fa.md)
+
+
+![PerSent Logo](https://github.com/user-attachments/assets/6bb1633b-6ed3-47fa-aae2-f97886dc4e22)
+
+## Introduction
+PerSent is a Python library designed for Persian sentiment analysis. The name stands for "Persian Sentiment Analyzer". Currently in its early testing phase, PerSent provides tools for analyzing sentiment in Persian text, particularly useful for product reviews and service feedback.
+
+## Features
+- Sentiment classification into three categories:
+  - `recommended`
+  - `not_recommended` 
+  - `no_idea`
+- Single text analysis
+- Batch processing from CSV files
+- Summary report generation
+
+## Installation
+Install the latest version using pip:
+
+```bash
+pip install PerSent
+```
+For a specific version:
+
+``` bash
+pip install PerSent==<VERSION_NUMBER>
+```
+
+## Basic Usage
+### Single Text Analysis
+```python
+from PerSent import CommentAnalyzer
+
+# Initialize analyzer
+analyzer = CommentAnalyzer()
+
+# Load pre-trained model
+analyzer.load_model()
+
+# Analyze text
+text = "کیفیت عالی داشت"
+result = analyzer.predict(text)
+print(f"Sentiment: {result}")
+# Output: Sentiment: recommended
+```
+
+### Training Your Own Model
+```python
+'''
+Train the model using a CSV file containing these columns:
+- body: the comment text
+- recommendation_status: one of recommended / not_recommended / no_idea
+'''
+analyzer.train("train.csv")
+```
+
+## Batch Processing
+### CSV Processing
+
+```python
+analyzer.csvPredict(
+    input_csv="comments.csv",
+    output_path="results.csv"
+)
+```
+
+### Advanced CSV Processing Options
+```python
+# Using column index
+analyzer.csvPredict("comments.csv", "results.csv", None, 0)
+
+# Using column name  
+analyzer.csvPredict("comments.csv", "results.csv", None, "Comments")
+
+# With summary report
+analyzer.csvPredict("comments.csv", "results.csv", "summary.csv")
+```
+
+## Dataset
+A sample training dataset is available:
+[Download Dataset](https://github.com/RezaGooner/Sentiment-Survey-Analyzer/tree/main/Dataset/big_train)
+
+## Contribution
+We welcome contributions and feedback:
+
+- [Fork Repository & Pull Request](https://github.com/RezaGooner/PerSent/fork)
+- [Make Issue](https://github.com/RezaGooner/PerSent/issues/new)
+- E-Mail : ```RezaAsadiProgrammer@gmail.com```
+- Telegram : ```@RezaGooner```
@@ -0,0 +1,11 @@
+README.md
+setup.py
+PerSent/CommentAnalyzer.py
+PerSent/__init__.py
+PerSent.egg-info/PKG-INFO
+PerSent.egg-info/SOURCES.txt
+PerSent.egg-info/dependency_links.txt
+PerSent.egg-info/requires.txt
+PerSent.egg-info/top_level.txt
+PerSent/model/classifier.joblib
+PerSent/model/word2vec.model
@@ -0,0 +1 @@
+
@@ -0,0 +1,6 @@
+hazm>=0.7.0
+gensim>=4.0.0
+scikit-learn>=1.0.0
+pandas>=1.3.0
+tqdm>=4.62.0
+joblib>=1.1.0
@@ -0,0 +1 @@
+PerSent
@@ -0,0 +1,199 @@
+#import necessary library
+import pandas as pd
+from hazm import Normalizer, word_tokenize, Stemmer, stopwords_list
+import re
+from tqdm import tqdm
+from gensim.models import Word2Vec
+import numpy as np
+from sklearn.linear_model import LogisticRegression
+from sklearn.model_selection import train_test_split
+import os
+import joblib
+
+class CommentAnalyzer:
+    def __init__(self, model_dir='PerSent/model'):
+        self.normalizer = Normalizer()
+        self.stemmer = Stemmer()
+        self.stopwords = set(stopwords_list())
+        self.model_dir = model_dir
+        self.vectorizer = None
+        self.classifier = None
+        
+        # make /model Directory if not exist
+        os.makedirs(self.model_dir, exist_ok=True)
+        
+    def _preprocess_text(self, text):
+        """PreProcess Persian Text"""
+        # Normalizing
+        text = self.normalizer.normalize(str(text))
+        
+        # remove number and sign
+        text = re.sub(r'[!()-\[\]{};:\'",؟<>./?@#$%^&*_~۰-۹\d]+', ' ', text)
+        text = re.sub(r'\s+', ' ', text).strip()
+        
+        # tokenize and stemming
+        tokens = word_tokenize(text)
+        processed_tokens = [
+            self.stemmer.stem(token)
+            for token in tokens
+            if token not in self.stopwords and len(token) > 1
+        ]
+        
+        return processed_tokens
+    
+    def _sentence_vector(self, sentence, model):
+        """convert sentences to vector by word2vec model"""
+        vectors = []
+        for word in sentence:
+            try:
+                vectors.append(model.wv[word])
+            except KeyError:
+                vectors.append(np.zeros(100))
+        return np.mean(vectors, axis=0) if vectors else np.zeros(100)
+    
+    def train(self, train_csv, test_size=0.2, vector_size=100, window=5):
+        """Train model"""
+        # read data
+        df = pd.read_csv(train_csv)
+        df['tokens'] = df['body'].apply(self._preprocess_text)
+        
+        # train Word2Vec model
+        self.vectorizer = Word2Vec(
+            sentences=df['tokens'],
+            vector_size=vector_size,
+            window=window,
+            min_count=1,
+            workers=4
+        )
+        
+        # convert sentences to vector
+        X = np.array([self._sentence_vector(s, self.vectorizer) for s in df['tokens']])
+        y = df['recommendation_status'].map({
+            "no_idea": 2,
+            "recommended": 1,
+            "not_recommended": 0
+        }).values
+        
+        # make train and test data
+        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
+
+        self.classifier = LogisticRegression(max_iter=1000)
+        self.classifier.fit(X_train, y_train)
+        
+        # save model
+        self.save_model()
+        
+        # evaluation
+        accuracy = self.classifier.score(X_test, y_test)
+        return accuracy
+    
+    def predict(self, text):
+        """Predict text sentiment"""
+        if not self.classifier or not self.vectorizer:
+            raise Exception("Model not trained! Call train() first or load a pretrained model.")
+            
+        tokens = self._preprocess_text(text)
+        vector = self._sentence_vector(tokens, self.vectorizer)
+        prediction = self.classifier.predict([vector])[0]
+        
+        return {
+            0: "not_recommended",
+            1: "recommended",
+            2: "no_idea"
+        }[prediction]
+    
+    def save_model(self):
+        """save trained model"""
+        joblib.dump(self.classifier, os.path.join(self.model_dir, 'classifier.joblib'))
+        self.vectorizer.save(os.path.join(self.model_dir, 'word2vec.model'))
+    
+    def load_model(self):
+        """reload from file"""
+        self.classifier = joblib.load(os.path.join(self.model_dir, 'classifier.joblib'))
+        self.vectorizer = Word2Vec.load(os.path.join(self.model_dir, 'word2vec.model'))
+        
+    def csvPredict(self, input_csv, output_path, summary_path=None, text_column=0):
+        """
+        Analyze sentiment for comments in a CSV file and save results
+        
+        Parameters:
+            input_csv (str): Path to input CSV file
+            output_path (str): Path to save output CSV file
+            text_column (str/int, optional): Name or index (0-based) of column containing comments. 
+                                          Defaults to 0 (first column).
+            summary_path (str, optional): Path to save prediction summary report.
+                                       If None, no summary will be saved.
+        """
+        try:
+            # Read input CSV
+            df = pd.read_csv(input_csv)
+            
+            # Determine the correct column
+            if isinstance(text_column, int):
+                # Handle negative indices
+                if text_column < 0:
+                    text_column = len(df.columns) + text_column
+                    
+                if text_column >= len(df.columns) or text_column < 0:
+                    raise ValueError(f"Column index {text_column} is out of range")
+                    
+                column_name = df.columns[text_column]
+            else:
+                if text_column not in df.columns:
+                    raise ValueError(f"Column '{text_column}' not found in CSV file")
+                column_name = text_column
+            
+            # Analyze each comment
+            tqdm.pandas(desc="Analyzing comments")
+            df['sentiment'] = df[column_name].progress_apply(self.predict)
+            
+            # Save results
+            df.to_csv(output_path, index=False, encoding='utf-8-sig')
+            print(f"Results saved to {output_path}")
+            
+            # Generate and save summary if requested
+            if summary_path:
+                summary = self._generate_summary(df)
+                summary.to_csv(summary_path, index=False, encoding='utf-8-sig')
+                print(f"Summary report saved to {summary_path}")
+            
+            return df
+            
+        except Exception as e:
+            print(f"Error: {str(e)}")
+            return None
+
+    def _generate_summary(self, df):
+        """Generate prediction summary statistics"""
+        # Count each sentiment
+        counts = df['sentiment'].value_counts().to_dict()
+        
+        # Create summary dataframe
+        summary = pd.DataFrame({
+            'Category': [
+                'Recommended',
+                'Not Recommended', 
+                'No Idea',
+                'Total',
+                'Model Accuracy'
+            ],
+            'Count': [
+                counts.get('recommended', 0),
+                counts.get('not_recommended', 0),
+                counts.get('no_idea', 0),
+                len(df),
+                'N/A'  # Accuracy needs to be calculated during training
+            ],
+            'Percentage': [
+                f"{100 * counts.get('recommended', 0) / len(df):.2f}%",
+                f"{100 * counts.get('not_recommended', 0) / len(df):.2f}%",
+                f"{100 * counts.get('no_idea', 0) / len(df):.2f}%",
+                '100%',
+                'N/A'
+            ]
+        })
+        
+        return summary
+
+
+# Github : RezaGooner
@@ -0,0 +1,4 @@
+# Package entry point: re-export the analyzer class from its module.
+from .CommentAnalyzer import CommentAnalyzer
+
+# Package version string.
+__version__ = "1.0.2"
+# Names exported by a star-import of this package.
+__all__ = ['CommentAnalyzer']