Commit c51e926
Parent(s): none (initial commit)

Initial commit: Business Intelligence Dashboard with Git LFS

Files changed:
- .gitattributes +5 -0
- README.md +232 -0
- app.py +1379 -0
- data/.DS_Store +0 -0
- data/Airbnb.csv +3 -0
- data/Online_Retail.xlsx +3 -0
- data_processor.py +819 -0
- insights.py +897 -0
- requirements.txt +7 -0
- tests/__pycache__/conftest.cpython-310-pytest-8.4.2.pyc +0 -0
- tests/__pycache__/test_data_processor.cpython-310-pytest-8.4.2.pyc +0 -0
- tests/__pycache__/test_insights.cpython-310-pytest-8.4.2.pyc +0 -0
- tests/__pycache__/test_utils.cpython-310-pytest-8.4.2.pyc +0 -0
- tests/__pycache__/test_visualizations.cpython-310-pytest-8.4.2.pyc +0 -0
- tests/conftest.py +5 -0
- tests/test_app.py +0 -0
- tests/test_data_processor.py +453 -0
- tests/test_insights.py +554 -0
- tests/test_utils.py +436 -0
- tests/test_visualizations.py +665 -0
- utils.py +480 -0
- visualizations.py +760 -0
.gitattributes
ADDED
@@ -0,0 +1,5 @@
```
# Auto detect text files and perform LF normalization
* text=auto
data/*.csv filter=lfs diff=lfs merge=lfs -text
data/*.xlsx filter=lfs diff=lfs merge=lfs -text
data/*.xls filter=lfs diff=lfs merge=lfs -text
```
README.md
ADDED
@@ -0,0 +1,232 @@
# 📊 Business Intelligence Dashboard

A professional, interactive Business Intelligence dashboard built with Gradio that enables non-technical stakeholders to explore and analyze business data. The application allows users to upload datasets, apply filters, generate visualizations, and extract actionable insights—all through an intuitive web interface.

## 🌟 Features

### 📂 Data Management
- **Pre-loaded Datasets**: Online Retail and Airbnb datasets included
- **Custom Upload**: Support for CSV, Excel (.xlsx, .xls), JSON, and Parquet files (max 50MB)
- **Automatic Data Cleaning**: Handles missing values, type conversions, and duplicate removal (see the sketch below)
- **Data Validation**: Comprehensive error handling and user-friendly error messages
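
A minimal sketch of what such a cleaning pass can look like in pandas. It is illustrative only, not the repository's actual `DataProcessor` implementation, and the fill/parse policies shown are assumptions:

```python
import pandas as pd

def clean(df: pd.DataFrame) -> pd.DataFrame:
    """Illustrative cleaning pass: dedupe, parse dates, fill numeric gaps."""
    df = df.drop_duplicates()
    # Try to parse object columns that look like dates
    for col in df.select_dtypes(include="object"):
        try:
            df[col] = pd.to_datetime(df[col])
        except (ValueError, TypeError):
            pass
    # Fill numeric gaps with the column median; drop rows that are entirely empty
    num_cols = df.select_dtypes(include="number").columns
    df[num_cols] = df[num_cols].fillna(df[num_cols].median())
    return df.dropna(how="all")
```
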
### 📈 Statistics & Profiling
- **Automated Data Profiling**: Get instant insights into your dataset (the pandas equivalents are sketched below)
- **Numerical Summary**: Mean, median, std deviation, quartiles, min/max
- **Categorical Analysis**: Unique values, value counts, mode
- **Missing Values Report**: Identify data quality issues
- **Correlation Matrix**: Visual correlation heatmap for numerical features
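
A rough sketch of these reports in plain pandas; the dashboard's `DataProfiler` may compute them differently:

```python
import pandas as pd

df = pd.read_csv("data/Airbnb.csv")                 # any loaded dataset
numerical = df.describe()                           # mean, std, quartiles, min/max
missing = df.isna().sum().sort_values(ascending=False)
modes = df.select_dtypes(exclude="number").mode().iloc[0]  # assumes >= 1 non-numeric column
corr = df.select_dtypes(include="number").corr()    # feeds the correlation heatmap
```
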
### 🔍 Interactive Filtering
- **Dynamic Filters**: Filter by numerical ranges, categorical values, or date ranges
- **Real-time Updates**: See row counts update as you apply filters
- **Multiple Filters**: Combine multiple filters for precise data exploration (combined as boolean masks in the sketch below)
- **Filter Management**: Easy to add, view, and clear filters
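
Under the hood, combining filters amounts to AND-ing boolean masks. A sketch using columns from the bundled Online Retail dataset (the threshold values are made up):

```python
import pandas as pd

df = pd.read_excel("data/Online_Retail.xlsx")

mask = pd.Series(True, index=df.index)
mask &= df["UnitPrice"].between(1.0, 50.0)                      # numerical range
mask &= df["Country"].isin(["United Kingdom", "France"])        # categorical values
mask &= df["InvoiceDate"].between("2011-01-01", "2011-06-30")   # date range
filtered = df[mask]
```
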
### 📉 Smart Visualizations
- **AI-Powered Recommendations**: Get intelligent visualization suggestions based on your data
- **One-Click Creation**: Create recommended visualizations with a single click
- **5 Visualization Types**:
  - Time Series Plots (with aggregation: sum, mean, count, median; see the sketch below)
  - Distribution Plots (histogram, box plot)
  - Category Analysis (bar chart, pie chart)
  - Scatter Plots (with color coding and trend lines)
  - Correlation Heatmap
- **Dual Backend**: Supports both Matplotlib and Plotly
- **Customization**: Full control over columns, aggregations, and visual parameters
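
The time-series option, for example, resamples by period before plotting. A hand-rolled equivalent, continuing with the Online Retail frame from the filtering sketch (`Revenue` is a derived column assumed for the example, not a dataset column):

```python
import matplotlib.pyplot as plt

ts = (
    df.assign(Revenue=df["Quantity"] * df["UnitPrice"])
      .set_index("InvoiceDate")["Revenue"]
      .resample("MS")      # month-start buckets
      .sum()               # swap in .mean(), .count(), or .median()
)
ts.plot(marker="o", title="Monthly revenue")
plt.tight_layout()
plt.show()
```
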
### 💡 Automated Insights
- **Top/Bottom Performers**: Identify highest and lowest values
- **Trend Analysis**: Detect patterns over time with growth rate and volatility
- **Anomaly Detection**: Find outliers using Z-score or IQR methods (see the sketch below)
- **Distribution Analysis**: Understand data distributions with skewness and kurtosis
- **Correlation Insights**: Discover strong relationships between variables
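
The two outlier rules in plain pandas/NumPy, continuing the same running example. The 1.5×IQR and |z| > 3 thresholds are the usual conventions, assumed here rather than taken from `insights.py`:

```python
import numpy as np

prices = df["UnitPrice"].dropna()

# IQR rule: flag points outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
q1, q3 = prices.quantile([0.25, 0.75])
iqr = q3 - q1
iqr_outliers = prices[(prices < q1 - 1.5 * iqr) | (prices > q3 + 1.5 * iqr)]

# Z-score rule: flag |z| > 3
z = (prices - prices.mean()) / prices.std()
z_outliers = prices[np.abs(z) > 3]
```
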
### 💾 Export Capabilities
- **Data Export**: Export filtered data as CSV or Excel (see the sketch below)
- **Visualization Export**: Save charts as PNG images
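
The exports map onto standard pandas and Matplotlib calls; a sketch, continuing the filtering example (file names are arbitrary):

```python
import matplotlib.pyplot as plt

filtered.to_csv("export.csv", index=False)
filtered.to_excel("export.xlsx", index=False)   # requires openpyxl

fig = plt.gcf()                                 # the chart currently drawn
fig.savefig("chart.png", dpi=150, bbox_inches="tight")
```
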
## 🏗️ Architecture & Design

### SOLID Principles Implementation
- **Single Responsibility**: Each class has one clear purpose
- **Open/Closed**: Extensible through the Strategy Pattern without modifying existing code
- **Liskov Substitution**: All strategies are interchangeable
- **Interface Segregation**: Specific interfaces for different operations
- **Dependency Inversion**: Depends on abstractions, not concrete implementations

### Design Patterns
- **Strategy Pattern**: Used for data loading, visualizations, and insights (sketched below)
- **Facade Pattern**: DataProcessor provides a simple interface to complex operations
- **Factory Pattern**: Dynamic strategy selection based on file type
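
A minimal sketch of the Strategy + Factory combination for data loading. Class and function names here are illustrative, not the actual ones in `data_processor.py`:

```python
from abc import ABC, abstractmethod
from pathlib import Path

import pandas as pd

class LoaderStrategy(ABC):
    """Open/Closed: support a new format by adding a strategy, not editing callers."""
    @abstractmethod
    def load(self, path: str) -> pd.DataFrame: ...

class CSVLoader(LoaderStrategy):
    def load(self, path: str) -> pd.DataFrame:
        return pd.read_csv(path)

class ExcelLoader(LoaderStrategy):
    def load(self, path: str) -> pd.DataFrame:
        return pd.read_excel(path)

def loader_for(path: str) -> LoaderStrategy:
    """Factory: select a strategy from the file extension."""
    strategies = {".csv": CSVLoader(), ".xlsx": ExcelLoader(), ".xls": ExcelLoader()}
    return strategies[Path(path).suffix.lower()]

df = loader_for("data/Airbnb.csv").load("data/Airbnb.csv")
```

Because every strategy honors the same `load` signature, strategies are interchangeable (Liskov Substitution), and callers depend only on the `LoaderStrategy` abstraction (Dependency Inversion).
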
### Project Structure
```
Business-Intelligence-Dashboard/
├── app.py               # Main Gradio application with 6 tabs
├── data_processor.py    # Data loading, cleaning, filtering (Strategy Pattern)
├── visualizations.py    # Chart creation with multiple strategies
├── insights.py          # Automated insight generation
├── utils.py             # Utility functions and validators
├── requirements.txt     # Python dependencies
├── README.md            # This file
├── data/                # Sample datasets
│   ├── Online_Retail.xlsx
│   └── Airbnb.csv
└── tests/               # Comprehensive test suite
    ├── __init__.py
    ├── conftest.py
    ├── test_utils.py
    ├── test_data_processor.py
    ├── test_visualizations.py
    └── test_insights.py
```

## 🚀 Getting Started

### Prerequisites
- Python 3.10 or higher
- pip package manager

### Installation

1. **Clone the repository**
```bash
git clone https://github.com/YOUR_USERNAME/Business-Intelligence-Dashboard.git
cd Business-Intelligence-Dashboard
```

2. **Create a virtual environment**
```bash
# On macOS/Linux
python3 -m venv venv
source venv/bin/activate

# On Windows
python -m venv venv
venv\Scripts\activate
```

3. **Install dependencies**
```bash
pip install -r requirements.txt
```

4. **Run the application**
```bash
python app.py
```

The dashboard will launch and open in your default browser at `http://localhost:7860`.

## 📖 Usage Guide

### 1. Loading Data
- **Option A**: Select "Online Retail" or "Airbnb" from the dropdown
- **Option B**: Upload your own dataset (CSV, Excel, JSON, or Parquet)

### 2. Exploring Statistics
- Navigate to the "Statistics & Profiling" tab
- Click "Generate Data Profile" to see comprehensive statistics
- View missing values, numerical summaries, and the correlation matrix

### 3. Filtering Data
- Go to the "Filter & Explore" tab
- Select a filter type (Numerical, Categorical, or Date)
- Choose a column and set the filter criteria
- Click "Add Filter" and watch the row count update in real time

### 4. Creating Visualizations
- Navigate to the "Visualizations" tab
- **Smart Recommendations**: Click "Get Visualization Recommendations" for AI-powered suggestions
- **Custom Visualizations**: Select a visualization type and configure its parameters
- Supported charts: Time Series, Distribution, Category, Scatter, Correlation

### 5. Generating Insights
- Go to the "Insights" tab
- Click "Generate All Insights" for automated analysis
- Or select a specific insight type for targeted analysis

### 6. Exporting Results
- Navigate to the "Export" tab
- Choose a format (CSV or Excel)
- Click "Export Data" to download the filtered dataset

## 🧪 Testing

Run the comprehensive test suite:
```bash
# Run all tests
pytest tests/ -v

# Run a specific test file
pytest tests/test_utils.py -v

# Run with coverage
pytest tests/ --cov=. --cov-report=html
```

Test coverage includes (a representative test is sketched below):
- **180+ test cases** across all modules
- Unit tests for all functions and classes
- Strategy Pattern implementation tests
- Edge case and error handling tests
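
A representative test in the style this suite uses. It relies only on `DataProcessor.load_and_prepare_data`, which `app.py` calls; the assertions are deliberately loose, since the cleaning step may drop rows or coerce types:

```python
import pandas as pd

from data_processor import DataProcessor

def test_load_and_prepare_data_from_csv(tmp_path):
    # Write a tiny CSV, then round-trip it through the processor.
    csv_path = tmp_path / "tiny.csv"
    pd.DataFrame({"qty": [1, 2, 2], "country": ["UK", "FR", "FR"]}).to_csv(csv_path, index=False)

    df = DataProcessor().load_and_prepare_data(str(csv_path))

    assert isinstance(df, pd.DataFrame)
    assert not df.empty
```
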
## 🛠️ Technologies Used

- **Gradio**: Web interface and interactive components
- **Pandas**: Data manipulation and analysis
- **NumPy**: Numerical computations
- **Matplotlib/Seaborn**: Static visualizations
- **Plotly**: Interactive visualizations
- **Python 3.10+**: Core programming language

## 📊 Sample Datasets

### Online Retail Dataset
- **8 columns**: InvoiceNo, StockCode, Description, Quantity, InvoiceDate, UnitPrice, CustomerID, Country
- **Use case**: E-commerce sales analysis, product trends, customer analysis

### Airbnb Dataset
- **26 columns**: Including price, location, room type, reviews, availability
- **Use case**: Pricing analysis, location trends, booking patterns

## 🤝 Contributing

Contributions are welcome! Please follow these steps:

1. Fork the repository
2. Create a feature branch (`git checkout -b feature/AmazingFeature`)
3. Commit your changes (`git commit -m 'Add some AmazingFeature'`)
4. Push to the branch (`git push origin feature/AmazingFeature`)
5. Open a Pull Request

### Development Guidelines
- Follow PEP 8 style guidelines
- Add docstrings to all functions
- Include unit tests for new features
- Update README.md for significant changes

## 👨‍💻 Author

**Craig Roberts**

## 🙏 Acknowledgments

- Northeastern University - CS5130 Course (Prof. Lino)
- Dataset sources: UCI ML Repository, Kaggle

## ⚡ Performance Notes

- Handles datasets up to 50MB efficiently
- Optimized for 1,000-10,000 rows
- Tested with datasets containing 100+ columns
- Real-time filtering with sub-second response times

## 🐛 Known Issues

- Large datasets (>100MB) may cause memory issues
- Some complex visualizations may take time to render
- Browser storage is not available (by design, for security)

---
app.py
ADDED
@@ -0,0 +1,1379 @@
```python
"""
Business Intelligence Dashboard - Main Gradio Application

This application provides an interactive BI dashboard with automated insights,
visualizations, and data exploration capabilities.

Author: Craig
Date: December 2024
"""

import gradio as gr
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from typing import Optional, Dict, Any, List, Tuple
import logging

from data_processor import DataProcessor, DataProfiler, DataFilter
from visualizations import VisualizationManager, save_visualization
from insights import InsightManager
from utils import (
    get_column_types, format_number, format_percentage,
    Config, CSVExporter, ExcelExporter
)

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


# Global state management
class AppState:
    """
    Manages application state across tabs.
    Follows Single Responsibility Principle - only manages state.
    """

    def __init__(self):
        self.processor = DataProcessor()
        self.viz_manager = VisualizationManager()
        self.insight_manager = InsightManager()

        # Available datasets
        self.datasets = {
            'Online Retail': 'data/Online_Retail.xlsx',
            'Airbnb': 'data/Airbnb.csv'
        }

        # Current session data
        self.current_dataset_name = None
        self.current_df = None
        self.filtered_df = None
        self.active_filters = []
        self.current_recommendations = None

    def load_dataset(self, dataset_name: str, file_path: Optional[str] = None) -> Tuple[pd.DataFrame, str]:
        """
        Load dataset by name or from uploaded file.

        Args:
            dataset_name: Name of dataset to load
            file_path: Optional path to uploaded file

        Returns:
            Tuple of (DataFrame, status_message)
        """
        try:
            if file_path:
                # Load uploaded file
                df = self.processor.load_and_prepare_data(file_path)
                self.current_dataset_name = f"Uploaded: {Path(file_path).name}"
            else:
                # Load predefined dataset
                if dataset_name not in self.datasets:
                    return None, f"❌ Dataset '{dataset_name}' not found"

                file_path = self.datasets[dataset_name]
                df = self.processor.load_and_prepare_data(file_path)
                self.current_dataset_name = dataset_name

            self.current_df = df
            self.filtered_df = df.copy()
            self.active_filters = []
            self.current_recommendations = None

            message = f"✅ Successfully loaded '{self.current_dataset_name}' - {len(df)} rows, {len(df.columns)} columns"
            logger.info(message)
            return df, message

        except Exception as e:
            error_msg = f"❌ Error loading dataset: {str(e)}"
            logger.error(error_msg)
            return None, error_msg

    def get_column_info(self) -> Dict[str, List[str]]:
        """Get categorized column information."""
        if self.current_df is None:
            return {'numerical': [], 'categorical': [], 'datetime': []}
        return get_column_types(self.current_df)

    def apply_filters(self, filters: List[Dict]) -> pd.DataFrame:
        """Apply filters to current dataset."""
        if self.current_df is None:
            return None

        self.active_filters = filters
        self.filtered_df = self.processor.apply_filters(filters)
        return self.filtered_df

    def reset_filters(self) -> pd.DataFrame:
        """Reset all filters."""
        if self.current_df is None:
            return None

        self.filtered_df = self.current_df.copy()
        self.active_filters = []
        return self.filtered_df


# Initialize global state
app_state = AppState()


# ============================================================================
# SMART VISUALIZATION RECOMMENDATIONS
# ============================================================================

class SmartVisualizationRecommender:
    """
    Recommends best visualizations based on data characteristics.
    Follows Single Responsibility Principle - only handles recommendations.
    """

    @staticmethod
    def analyze_dataset(df: pd.DataFrame) -> Dict[str, Any]:
        """
        Analyze dataset and recommend visualizations.

        Args:
            df: DataFrame to analyze

        Returns:
            Dict with recommendations
        """
        column_types = get_column_types(df)
        recommendations = []

        # Time Series Recommendations
        if len(column_types['datetime']) > 0 and len(column_types['numerical']) > 0:
            recommendations.append({
                'type': 'time_series',
                'priority': 'high',
                'reason': 'Detected date and numerical columns - perfect for trend analysis',
                'suggested_params': {
                    'date_column': column_types['datetime'][0],
                    'value_column': column_types['numerical'][0],
                    'aggregation': 'sum'
                }
            })

        # Correlation Heatmap Recommendations
        if len(column_types['numerical']) >= 3:
            recommendations.append({
                'type': 'correlation',
                'priority': 'high',
                'reason': f'Found {len(column_types["numerical"])} numerical columns - great for correlation analysis',
                'suggested_params': {}
            })

        # Category Analysis Recommendations
        if len(column_types['categorical']) > 0:
            cat_col = column_types['categorical'][0]
            unique_count = df[cat_col].nunique()

            if unique_count <= 10:
                recommendations.append({
                    'type': 'category',
                    'priority': 'high',
                    'reason': f'Found categorical column "{cat_col}" with {unique_count} categories',
                    'suggested_params': {
                        'column': cat_col,
                        'plot_type': 'bar'
                    }
                })

        # Distribution Recommendations
        if len(column_types['numerical']) > 0:
            recommendations.append({
                'type': 'distribution',
                'priority': 'medium',
                'reason': 'Numerical data available - useful for understanding value distribution',
                'suggested_params': {
                    'column': column_types['numerical'][0],
                    'plot_type': 'histogram'
                }
            })

        # Scatter Plot Recommendations
        if len(column_types['numerical']) >= 2:
            recommendations.append({
                'type': 'scatter',
                'priority': 'medium',
                'reason': 'Multiple numerical columns - explore relationships between variables',
                'suggested_params': {
                    'x_column': column_types['numerical'][0],
                    'y_column': column_types['numerical'][1]
                }
            })

        # Sort by priority
        priority_order = {'high': 0, 'medium': 1, 'low': 2}
        recommendations.sort(key=lambda x: priority_order[x['priority']])

        return {
            'column_types': column_types,
            'recommendations': recommendations,
            'summary': SmartVisualizationRecommender._generate_summary(recommendations)
        }

    @staticmethod
    def _generate_summary(recommendations: List[Dict]) -> str:
        """Generate human-readable summary of recommendations."""
        if not recommendations:
            return "No specific visualization recommendations available."

        high_priority = [r for r in recommendations if r['priority'] == 'high']

        if high_priority:
            summary = f"🎯 **Top Recommendation**: {high_priority[0]['type'].replace('_', ' ').title()}\n"
            summary += f"💡 {high_priority[0]['reason']}\n\n"

            if len(high_priority) > 1:
                summary += f"Also recommended: {', '.join([r['type'].replace('_', ' ').title() for r in high_priority[1:]])}"
        else:
            summary = f"📊 Recommended: {recommendations[0]['type'].replace('_', ' ').title()}"

        return summary

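
# --------------------------------------------------------------------------
# Illustrative usage (not part of app.py): what analyze_dataset returns on a
# toy frame. The column names below are invented for the example.
#
#   >>> toy = pd.DataFrame({
#   ...     "when": pd.date_range("2024-01-01", periods=4, freq="D"),
#   ...     "amount": [1.0, 2.0, 3.0, 4.0],
#   ...     "city": ["a", "b", "a", "b"],
#   ... })
#   >>> out = SmartVisualizationRecommender.analyze_dataset(toy)
#   >>> out["recommendations"][0]["type"]   # high-priority entries sort first
#   'time_series'
# --------------------------------------------------------------------------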
| 240 |
+
# ============================================================================
|
| 241 |
+
# TAB 1: DATASET SELECTION
|
| 242 |
+
# ============================================================================
|
| 243 |
+
|
| 244 |
+
def create_dataset_tab():
|
| 245 |
+
"""Create dataset selection and preview tab."""
|
| 246 |
+
|
| 247 |
+
with gr.Tab("📊 Dataset Selection"):
|
| 248 |
+
gr.Markdown("## Select or Upload Dataset")
|
| 249 |
+
gr.Markdown("Choose from pre-loaded datasets or upload your own (CSV, Excel, JSON, Parquet)")
|
| 250 |
+
|
| 251 |
+
with gr.Row():
|
| 252 |
+
with gr.Column(scale=1):
|
| 253 |
+
dataset_dropdown = gr.Dropdown(
|
| 254 |
+
choices=list(app_state.datasets.keys()),
|
| 255 |
+
label="Pre-loaded Datasets",
|
| 256 |
+
value=None
|
| 257 |
+
)
|
| 258 |
+
|
| 259 |
+
load_btn = gr.Button("📂 Load Selected Dataset", variant="primary")
|
| 260 |
+
|
| 261 |
+
gr.Markdown("### OR Upload Your Own Dataset")
|
| 262 |
+
file_upload = gr.File(
|
| 263 |
+
label="Upload Dataset (Max 50MB)",
|
| 264 |
+
file_types=[".csv", ".xlsx", ".xls", ".json", ".parquet"]
|
| 265 |
+
)
|
| 266 |
+
|
| 267 |
+
upload_btn = gr.Button("📤 Upload & Process", variant="secondary")
|
| 268 |
+
|
| 269 |
+
with gr.Column(scale=1):
|
| 270 |
+
status_box = gr.Textbox(
|
| 271 |
+
label="Status",
|
| 272 |
+
value="No dataset loaded",
|
| 273 |
+
interactive=False,
|
| 274 |
+
lines=3
|
| 275 |
+
)
|
| 276 |
+
|
| 277 |
+
dataset_info = gr.Textbox(
|
| 278 |
+
label="Dataset Information",
|
| 279 |
+
value="",
|
| 280 |
+
interactive=False,
|
| 281 |
+
lines=8
|
| 282 |
+
)
|
| 283 |
+
|
| 284 |
+
gr.Markdown("### Data Preview")
|
| 285 |
+
data_preview = gr.Dataframe(
|
| 286 |
+
label="First 100 rows",
|
| 287 |
+
interactive=False,
|
| 288 |
+
wrap=True
|
| 289 |
+
)
|
| 290 |
+
|
| 291 |
+
# Event handlers
|
| 292 |
+
def load_predefined_dataset(dataset_name):
|
| 293 |
+
if not dataset_name:
|
| 294 |
+
return None, "⚠️ Please select a dataset", "", None
|
| 295 |
+
|
| 296 |
+
df, status = app_state.load_dataset(dataset_name)
|
| 297 |
+
|
| 298 |
+
if df is not None:
|
| 299 |
+
info = f"📊 **Dataset**: {dataset_name}\n"
|
| 300 |
+
info += f"📏 **Shape**: {df.shape[0]} rows × {df.shape[1]} columns\n"
|
| 301 |
+
info += f"💾 **Memory**: {df.memory_usage(deep=True).sum() / 1024 / 1024:.2f} MB\n\n"
|
| 302 |
+
info += f"**Column Types**:\n"
|
| 303 |
+
|
| 304 |
+
col_types = get_column_types(df)
|
| 305 |
+
info += f"- Numerical: {len(col_types['numerical'])}\n"
|
| 306 |
+
info += f"- Categorical: {len(col_types['categorical'])}\n"
|
| 307 |
+
info += f"- DateTime: {len(col_types['datetime'])}\n"
|
| 308 |
+
|
| 309 |
+
preview = df.head(100)
|
| 310 |
+
return dataset_name, status, info, preview
|
| 311 |
+
|
| 312 |
+
return None, status, "", None
|
| 313 |
+
|
| 314 |
+
def upload_custom_dataset(file):
|
| 315 |
+
if file is None:
|
| 316 |
+
return "⚠️ Please upload a file", "", None
|
| 317 |
+
|
| 318 |
+
# Check file size (50MB limit)
|
| 319 |
+
file_size_mb = Path(file.name).stat().st_size / (1024 * 1024)
|
| 320 |
+
if file_size_mb > 50:
|
| 321 |
+
return f"❌ File too large ({file_size_mb:.1f}MB). Maximum size: 50MB", "", None
|
| 322 |
+
|
| 323 |
+
df, status = app_state.load_dataset("uploaded", file.name)
|
| 324 |
+
|
| 325 |
+
if df is not None:
|
| 326 |
+
info = f"📊 **Dataset**: {Path(file.name).name}\n"
|
| 327 |
+
info += f"📏 **Shape**: {df.shape[0]} rows × {df.shape[1]} columns\n"
|
| 328 |
+
info += f"💾 **Memory**: {df.memory_usage(deep=True).sum() / 1024 / 1024:.2f} MB\n\n"
|
| 329 |
+
info += f"**Column Types**:\n"
|
| 330 |
+
|
| 331 |
+
col_types = get_column_types(df)
|
| 332 |
+
info += f"- Numerical: {len(col_types['numerical'])}\n"
|
| 333 |
+
info += f"- Categorical: {len(col_types['categorical'])}\n"
|
| 334 |
+
info += f"- DateTime: {len(col_types['datetime'])}\n"
|
| 335 |
+
|
| 336 |
+
preview = df.head(100)
|
| 337 |
+
return status, info, preview
|
| 338 |
+
|
| 339 |
+
return status, "", None
|
| 340 |
+
|
| 341 |
+
load_btn.click(
|
| 342 |
+
fn=load_predefined_dataset,
|
| 343 |
+
inputs=[dataset_dropdown],
|
| 344 |
+
outputs=[dataset_dropdown, status_box, dataset_info, data_preview]
|
| 345 |
+
)
|
| 346 |
+
|
| 347 |
+
upload_btn.click(
|
| 348 |
+
fn=upload_custom_dataset,
|
| 349 |
+
inputs=[file_upload],
|
| 350 |
+
outputs=[status_box, dataset_info, data_preview]
|
| 351 |
+
)
|
| 352 |
+
|
| 353 |
+
return dataset_dropdown, status_box, dataset_info, data_preview, load_btn, upload_btn
|
| 354 |
+
|
| 355 |
+
|
| 356 |
+
# ============================================================================
|
| 357 |
+
# TAB 2: STATISTICS & PROFILING
|
| 358 |
+
# ============================================================================
|
| 359 |
+
|
| 360 |
+
def create_statistics_tab():
|
| 361 |
+
"""Create statistics and data profiling tab."""
|
| 362 |
+
|
| 363 |
+
with gr.Tab("📈 Statistics & Profiling"):
|
| 364 |
+
gr.Markdown("## Data Profiling & Summary Statistics")
|
| 365 |
+
|
| 366 |
+
profile_btn = gr.Button("🔍 Generate Data Profile", variant="primary")
|
| 367 |
+
|
| 368 |
+
with gr.Row():
|
| 369 |
+
with gr.Column():
|
| 370 |
+
gr.Markdown("### Missing Values Report")
|
| 371 |
+
missing_values = gr.Dataframe(label="Missing Values")
|
| 372 |
+
|
| 373 |
+
with gr.Column():
|
| 374 |
+
gr.Markdown("### Numerical Summary")
|
| 375 |
+
numerical_summary = gr.Dataframe(label="Descriptive Statistics")
|
| 376 |
+
|
| 377 |
+
gr.Markdown("### Categorical Summary")
|
| 378 |
+
categorical_summary = gr.Textbox(
|
| 379 |
+
label="Categorical Variables",
|
| 380 |
+
lines=10,
|
| 381 |
+
interactive=False
|
| 382 |
+
)
|
| 383 |
+
|
| 384 |
+
gr.Markdown("### Correlation Matrix")
|
| 385 |
+
correlation_plot = gr.Plot(label="Correlation Heatmap")
|
| 386 |
+
|
| 387 |
+
def generate_profile():
|
| 388 |
+
if app_state.current_df is None:
|
| 389 |
+
return (
|
| 390 |
+
None, None, "⚠️ No dataset loaded. Please load a dataset first.", None
|
| 391 |
+
)
|
| 392 |
+
|
| 393 |
+
try:
|
| 394 |
+
profile = app_state.processor.get_data_profile()
|
| 395 |
+
|
| 396 |
+
# Missing values
|
| 397 |
+
missing_df = profile['missing_values']
|
| 398 |
+
|
| 399 |
+
# Numerical summary
|
| 400 |
+
num_summary = profile['numerical_summary']
|
| 401 |
+
|
| 402 |
+
# Categorical summary - FIXED
|
| 403 |
+
cat_summary = profile['categorical_summary']
|
| 404 |
+
cat_text = ""
|
| 405 |
+
for col, stats in cat_summary.items():
|
| 406 |
+
cat_text += f"\n**{col}**:\n"
|
| 407 |
+
cat_text += f" - Unique values: {stats['unique_count']}\n"
|
| 408 |
+
|
| 409 |
+
# Safe handling of top_value
|
| 410 |
+
top_val = stats.get('top_value', 'N/A')
|
| 411 |
+
if pd.isna(top_val):
|
| 412 |
+
top_val = 'N/A'
|
| 413 |
+
cat_text += f" - Most common: {top_val} ({stats['top_value_frequency']} occurrences)\n"
|
| 414 |
+
|
| 415 |
+
# Safe handling of value_counts
|
| 416 |
+
if stats.get('value_counts'):
|
| 417 |
+
top_values = list(stats['value_counts'].keys())[:5]
|
| 418 |
+
cat_text += f" - Top values: {', '.join(str(v) for v in top_values)}\n"
|
| 419 |
+
|
| 420 |
+
if not cat_text:
|
| 421 |
+
cat_text = "No categorical columns found."
|
| 422 |
+
|
| 423 |
+
# Correlation matrix
|
| 424 |
+
corr_matrix = profile['correlation_matrix']
|
| 425 |
+
|
| 426 |
+
if not corr_matrix.empty and len(corr_matrix.columns) >= 2:
|
| 427 |
+
fig = app_state.viz_manager.create_visualization(
|
| 428 |
+
'correlation',
|
| 429 |
+
app_state.current_df,
|
| 430 |
+
backend='matplotlib'
|
| 431 |
+
)
|
| 432 |
+
else:
|
| 433 |
+
fig = None
|
| 434 |
+
|
| 435 |
+
return missing_df, num_summary, cat_text, fig
|
| 436 |
+
|
| 437 |
+
except Exception as e:
|
| 438 |
+
logger.error(f"Error generating profile: {e}")
|
| 439 |
+
import traceback
|
| 440 |
+
traceback.print_exc()
|
| 441 |
+
return None, None, f"❌ Error: {str(e)}", None
|
| 442 |
+
|
| 443 |
+
profile_btn.click(
|
| 444 |
+
fn=generate_profile,
|
| 445 |
+
outputs=[missing_values, numerical_summary, categorical_summary, correlation_plot]
|
| 446 |
+
)
|
| 447 |
+
|
| 448 |
+
return profile_btn, missing_values, numerical_summary, categorical_summary, correlation_plot
|
| 449 |
+
|
| 450 |
+
|
| 451 |
+
# ============================================================================
|
| 452 |
+
# TAB 3: FILTER & EXPLORE
|
| 453 |
+
# ============================================================================
|
| 454 |
+
|
| 455 |
+
def create_filter_tab():
|
| 456 |
+
"""Create interactive filtering tab."""
|
| 457 |
+
|
| 458 |
+
with gr.Tab("🔍 Filter & Explore"):
|
| 459 |
+
gr.Markdown("## Interactive Data Filtering")
|
| 460 |
+
gr.Markdown("Apply filters to narrow down your data for analysis")
|
| 461 |
+
|
| 462 |
+
with gr.Row():
|
| 463 |
+
with gr.Column(scale=1):
|
| 464 |
+
gr.Markdown("### Filter Controls")
|
| 465 |
+
|
| 466 |
+
filter_type = gr.Radio(
|
| 467 |
+
choices=["Numerical Range", "Categorical Values", "Date Range"],
|
| 468 |
+
label="Filter Type",
|
| 469 |
+
value="Numerical Range"
|
| 470 |
+
)
|
| 471 |
+
|
| 472 |
+
column_select = gr.Dropdown(
|
| 473 |
+
label="Select Column",
|
| 474 |
+
choices=[],
|
| 475 |
+
interactive=True
|
| 476 |
+
)
|
| 477 |
+
|
| 478 |
+
# Numerical filters
|
| 479 |
+
with gr.Group(visible=True) as numerical_group:
|
| 480 |
+
min_value = gr.Number(label="Minimum Value")
|
| 481 |
+
max_value = gr.Number(label="Maximum Value")
|
| 482 |
+
|
| 483 |
+
# Categorical filters
|
| 484 |
+
with gr.Group(visible=False) as categorical_group:
|
| 485 |
+
category_select = gr.CheckboxGroup(
|
| 486 |
+
label="Select Values",
|
| 487 |
+
choices=[]
|
| 488 |
+
)
|
| 489 |
+
|
| 490 |
+
# Date filters
|
| 491 |
+
with gr.Group(visible=False) as date_group:
|
| 492 |
+
start_date = gr.Textbox(label="Start Date (YYYY-MM-DD)")
|
| 493 |
+
end_date = gr.Textbox(label="End Date (YYYY-MM-DD)")
|
| 494 |
+
|
| 495 |
+
add_filter_btn = gr.Button("➕ Add Filter", variant="primary")
|
| 496 |
+
clear_filters_btn = gr.Button("🗑️ Clear All Filters", variant="secondary")
|
| 497 |
+
|
| 498 |
+
with gr.Column(scale=2):
|
| 499 |
+
filter_status = gr.Textbox(
|
| 500 |
+
label="Active Filters",
|
| 501 |
+
value="No filters applied",
|
| 502 |
+
lines=5,
|
| 503 |
+
interactive=False
|
| 504 |
+
)
|
| 505 |
+
|
| 506 |
+
row_count = gr.Textbox(
|
| 507 |
+
label="Filtered Row Count",
|
| 508 |
+
value="0 rows",
|
| 509 |
+
interactive=False
|
| 510 |
+
)
|
| 511 |
+
|
| 512 |
+
filtered_preview = gr.Dataframe(
|
| 513 |
+
label="Filtered Data Preview",
|
| 514 |
+
interactive=False
|
| 515 |
+
)
|
| 516 |
+
|
| 517 |
+
def update_column_choices(filter_type_value):
|
| 518 |
+
if app_state.current_df is None:
|
| 519 |
+
return gr.Dropdown(choices=[]), gr.Group(visible=False), gr.Group(visible=False), gr.Group(visible=False)
|
| 520 |
+
|
| 521 |
+
col_types = get_column_types(app_state.current_df)
|
| 522 |
+
|
| 523 |
+
if filter_type_value == "Numerical Range":
|
| 524 |
+
choices = col_types['numerical']
|
| 525 |
+
return (
|
| 526 |
+
gr.Dropdown(choices=choices),
|
| 527 |
+
gr.Group(visible=True),
|
| 528 |
+
gr.Group(visible=False),
|
| 529 |
+
gr.Group(visible=False)
|
| 530 |
+
)
|
| 531 |
+
elif filter_type_value == "Categorical Values":
|
| 532 |
+
choices = col_types['categorical']
|
| 533 |
+
return (
|
| 534 |
+
gr.Dropdown(choices=choices),
|
| 535 |
+
gr.Group(visible=False),
|
| 536 |
+
gr.Group(visible=True),
|
| 537 |
+
gr.Group(visible=False)
|
| 538 |
+
)
|
| 539 |
+
else: # Date Range
|
| 540 |
+
choices = col_types['datetime']
|
| 541 |
+
return (
|
| 542 |
+
gr.Dropdown(choices=choices),
|
| 543 |
+
gr.Group(visible=False),
|
| 544 |
+
gr.Group(visible=False),
|
| 545 |
+
gr.Group(visible=True)
|
| 546 |
+
)
|
| 547 |
+
|
| 548 |
+
def update_category_choices(column):
|
| 549 |
+
if app_state.current_df is None or not column:
|
| 550 |
+
return gr.CheckboxGroup(choices=[])
|
| 551 |
+
|
| 552 |
+
unique_values = app_state.current_df[column].dropna().unique().tolist()
|
| 553 |
+
return gr.CheckboxGroup(choices=unique_values[:50]) # Limit to 50 for performance
|
| 554 |
+
|
| 555 |
+
def add_filter(filter_type_value, column, min_val, max_val, categories, start, end):
|
| 556 |
+
if app_state.current_df is None:
|
| 557 |
+
return "⚠️ No dataset loaded", "0 rows", None
|
| 558 |
+
|
| 559 |
+
if not column:
|
| 560 |
+
return "⚠️ Please select a column", f"{len(app_state.filtered_df)} rows", app_state.filtered_df.head(100)
|
| 561 |
+
|
| 562 |
+
# Create filter configuration
|
| 563 |
+
filter_config = {'column': column}
|
| 564 |
+
|
| 565 |
+
if filter_type_value == "Numerical Range":
|
| 566 |
+
filter_config['type'] = 'numerical'
|
| 567 |
+
filter_config['min_val'] = min_val
|
| 568 |
+
filter_config['max_val'] = max_val
|
| 569 |
+
elif filter_type_value == "Categorical Values":
|
| 570 |
+
filter_config['type'] = 'categorical'
|
| 571 |
+
filter_config['values'] = categories if categories else []
|
| 572 |
+
else: # Date Range
|
| 573 |
+
filter_config['type'] = 'date'
|
| 574 |
+
filter_config['start_date'] = start if start else None
|
| 575 |
+
filter_config['end_date'] = end if end else None
|
| 576 |
+
|
| 577 |
+
# Add to active filters
|
| 578 |
+
app_state.active_filters.append(filter_config)
|
| 579 |
+
|
| 580 |
+
# Apply all filters
|
| 581 |
+
filtered_df = app_state.apply_filters(app_state.active_filters)
|
| 582 |
+
|
| 583 |
+
# Generate status message
|
| 584 |
+
status = "**Active Filters:**\n"
|
| 585 |
+
for i, f in enumerate(app_state.active_filters, 1):
|
| 586 |
+
status += f"{i}. {f['column']} ({f['type']})\n"
|
| 587 |
+
|
| 588 |
+
row_info = f"{len(filtered_df)} rows (filtered from {len(app_state.current_df)})"
|
| 589 |
+
|
| 590 |
+
return status, row_info, filtered_df.head(100)
|
| 591 |
+
|
| 592 |
+
def clear_all_filters():
|
| 593 |
+
if app_state.current_df is None:
|
| 594 |
+
return "No filters applied", "0 rows", None
|
| 595 |
+
|
| 596 |
+
app_state.reset_filters()
|
| 597 |
+
row_info = f"{len(app_state.current_df)} rows"
|
| 598 |
+
|
| 599 |
+
return "No filters applied", row_info, app_state.current_df.head(100)
|
| 600 |
+
|
| 601 |
+
# Event handlers
|
| 602 |
+
filter_type.change(
|
| 603 |
+
fn=update_column_choices,
|
| 604 |
+
inputs=[filter_type],
|
| 605 |
+
outputs=[column_select, numerical_group, categorical_group, date_group]
|
| 606 |
+
)
|
| 607 |
+
|
| 608 |
+
column_select.change(
|
| 609 |
+
fn=update_category_choices,
|
| 610 |
+
inputs=[column_select],
|
| 611 |
+
outputs=[category_select]
|
| 612 |
+
)
|
| 613 |
+
|
| 614 |
+
add_filter_btn.click(
|
| 615 |
+
fn=add_filter,
|
| 616 |
+
inputs=[filter_type, column_select, min_value, max_value, category_select, start_date, end_date],
|
| 617 |
+
outputs=[filter_status, row_count, filtered_preview]
|
| 618 |
+
)
|
| 619 |
+
|
| 620 |
+
clear_filters_btn.click(
|
| 621 |
+
fn=clear_all_filters,
|
| 622 |
+
outputs=[filter_status, row_count, filtered_preview]
|
| 623 |
+
)
|
| 624 |
+
|
| 625 |
+
return (filter_type, column_select, filter_status, row_count, filtered_preview,
|
| 626 |
+
add_filter_btn, clear_filters_btn)
|
| 627 |
+
|
| 628 |
+
|
| 629 |
+
# ============================================================================
|
| 630 |
+
# TAB 4: VISUALIZATIONS
|
| 631 |
+
# ============================================================================
|
| 632 |
+
|
| 633 |
+
def create_visualization_tab():
|
| 634 |
+
"""Create visualization tab with smart recommendations."""
|
| 635 |
+
|
| 636 |
+
with gr.Tab("📉 Visualizations"):
|
| 637 |
+
gr.Markdown("## Create Visualizations")
|
| 638 |
+
|
| 639 |
+
# Smart Recommendations Section
|
| 640 |
+
with gr.Accordion("🎯 Smart Recommendations", open=True):
|
| 641 |
+
recommend_btn = gr.Button("💡 Get Visualization Recommendations", variant="primary")
|
| 642 |
+
recommendations_output = gr.Markdown(value="Click the button to get recommendations")
|
| 643 |
+
|
| 644 |
+
# Dynamic recommendation buttons
|
| 645 |
+
with gr.Row(visible=False) as rec_buttons_row:
|
| 646 |
+
rec_btn_1 = gr.Button("", visible=False, variant="secondary", scale=1)
|
| 647 |
+
rec_btn_2 = gr.Button("", visible=False, variant="secondary", scale=1)
|
| 648 |
+
rec_btn_3 = gr.Button("", visible=False, variant="secondary", scale=1)
|
| 649 |
+
|
| 650 |
+
rec_viz_output = gr.Plot(label="Recommended Visualization", visible=False)
|
| 651 |
+
rec_status = gr.Textbox(label="Status", visible=False, interactive=False)
|
| 652 |
+
|
| 653 |
+
def get_recommendations():
|
| 654 |
+
if app_state.filtered_df is None or app_state.filtered_df.empty:
|
| 655 |
+
return "⚠️ No data available. Please load a dataset first.", gr.Row(visible=False), "", "", "", gr.Plot(visible=False), gr.Textbox(visible=False)
|
| 656 |
+
|
| 657 |
+
recommender = SmartVisualizationRecommender()
|
| 658 |
+
analysis = recommender.analyze_dataset(app_state.filtered_df)
|
| 659 |
+
app_state.current_recommendations = analysis['recommendations']
|
| 660 |
+
|
| 661 |
+
output = "## 🎯 Recommended Visualizations\n\n"
|
| 662 |
+
output += analysis['summary'] + "\n\n"
|
| 663 |
+
|
| 664 |
+
output += "### Click below to create recommended visualizations:\n\n"
|
| 665 |
+
|
| 666 |
+
# Prepare button labels
|
| 667 |
+
btn_labels = ["", "", ""]
|
| 668 |
+
for i, rec in enumerate(analysis['recommendations'][:3]):
|
| 669 |
+
priority_emoji = "🔴" if rec['priority'] == 'high' else "🟡"
|
| 670 |
+
btn_labels[i] = f"{priority_emoji} Create {rec['type'].replace('_', ' ').title()}"
|
| 671 |
+
|
| 672 |
+
return (
|
| 673 |
+
output,
|
| 674 |
+
gr.Row(visible=True),
|
| 675 |
+
gr.Button(value=btn_labels[0], visible=True) if btn_labels[0] else gr.Button(visible=False),
|
| 676 |
+
gr.Button(value=btn_labels[1], visible=True) if btn_labels[1] else gr.Button(visible=False),
|
| 677 |
+
gr.Button(value=btn_labels[2], visible=True) if btn_labels[2] else gr.Button(visible=False),
|
| 678 |
+
gr.Plot(visible=False),
|
| 679 |
+
gr.Textbox(visible=False)
|
| 680 |
+
)
|
| 681 |
+
|
| 682 |
+
def create_recommended_viz(rec_index):
|
| 683 |
+
if app_state.current_recommendations is None or rec_index >= len(app_state.current_recommendations):
|
| 684 |
+
return None, "⚠️ No recommendation available"
|
| 685 |
+
|
| 686 |
+
rec = app_state.current_recommendations[rec_index]
|
| 687 |
+
|
| 688 |
+
try:
|
| 689 |
+
if rec['type'] == 'time_series':
|
| 690 |
+
params = rec['suggested_params']
|
| 691 |
+
fig = app_state.viz_manager.create_visualization(
|
| 692 |
+
'time_series',
|
| 693 |
+
app_state.filtered_df,
|
| 694 |
+
date_column=params['date_column'],
|
| 695 |
+
value_column=params['value_column'],
|
| 696 |
+
aggregation=params['aggregation'],
|
| 697 |
+
backend='matplotlib'
|
| 698 |
+
)
|
| 699 |
+
status = f"✅ Created recommended time series plot"
|
| 700 |
+
|
| 701 |
+
elif rec['type'] == 'correlation':
|
| 702 |
+
fig = app_state.viz_manager.create_visualization(
|
| 703 |
+
'correlation',
|
| 704 |
+
app_state.filtered_df,
|
| 705 |
+
backend='matplotlib'
|
| 706 |
+
)
|
| 707 |
+
status = "✅ Created recommended correlation heatmap"
|
| 708 |
+
|
| 709 |
+
elif rec['type'] == 'category':
|
| 710 |
+
params = rec['suggested_params']
|
| 711 |
+
fig = app_state.viz_manager.create_visualization(
|
| 712 |
+
'category',
|
| 713 |
+
app_state.filtered_df,
|
| 714 |
+
column=params['column'],
|
| 715 |
+
plot_type=params['plot_type'],
|
| 716 |
+
backend='matplotlib'
|
| 717 |
+
)
|
| 718 |
+
status = f"✅ Created recommended category plot"
|
| 719 |
+
|
| 720 |
+
elif rec['type'] == 'distribution':
|
| 721 |
+
params = rec['suggested_params']
|
| 722 |
+
fig = app_state.viz_manager.create_visualization(
|
| 723 |
+
'distribution',
|
| 724 |
+
app_state.filtered_df,
|
| 725 |
+
column=params['column'],
|
| 726 |
+
plot_type=params['plot_type'],
|
| 727 |
+
backend='matplotlib'
|
| 728 |
+
)
|
| 729 |
+
status = "✅ Created recommended distribution plot"
|
| 730 |
+
|
| 731 |
+
elif rec['type'] == 'scatter':
|
| 732 |
+
params = rec['suggested_params']
|
| 733 |
+
fig = app_state.viz_manager.create_visualization(
|
| 734 |
+
'scatter',
|
| 735 |
+
app_state.filtered_df,
|
| 736 |
+
x_column=params['x_column'],
|
| 737 |
+
y_column=params['y_column'],
|
| 738 |
+
backend='matplotlib'
|
| 739 |
+
)
|
| 740 |
+
status = "✅ Created recommended scatter plot"
|
| 741 |
+
else:
|
| 742 |
+
return None, "❌ Unknown recommendation type"
|
| 743 |
+
|
| 744 |
+
return gr.Plot(value=fig, visible=True), gr.Textbox(value=status, visible=True)
|
| 745 |
+
|
| 746 |
+
except Exception as e:
|
| 747 |
+
logger.error(f"Error creating recommended visualization: {e}")
|
| 748 |
+
return None, gr.Textbox(value=f"❌ Error: {str(e)}", visible=True)
|
| 749 |
+
|
| 750 |
+
recommend_btn.click(
|
| 751 |
+
fn=get_recommendations,
|
| 752 |
+
outputs=[recommendations_output, rec_buttons_row, rec_btn_1, rec_btn_2, rec_btn_3, rec_viz_output, rec_status]
|
| 753 |
+
)
|
| 754 |
+
|
| 755 |
+
rec_btn_1.click(
|
| 756 |
+
fn=lambda: create_recommended_viz(0),
|
| 757 |
+
outputs=[rec_viz_output, rec_status]
|
| 758 |
+
)
|
| 759 |
+
|
| 760 |
+
rec_btn_2.click(
|
| 761 |
+
fn=lambda: create_recommended_viz(1),
|
| 762 |
+
outputs=[rec_viz_output, rec_status]
|
| 763 |
+
)
|
| 764 |
+
|
| 765 |
+
rec_btn_3.click(
|
| 766 |
+
fn=lambda: create_recommended_viz(2),
|
| 767 |
+
outputs=[rec_viz_output, rec_status]
|
| 768 |
+
)
|
| 769 |
+
|
| 770 |
+
gr.Markdown("---")
|
| 771 |
+
gr.Markdown("### Create Custom Visualization")
|
| 772 |
+
|
| 773 |
+
with gr.Row():
|
| 774 |
+
with gr.Column(scale=1):
|
| 775 |
+
viz_type = gr.Dropdown(
|
| 776 |
+
label="Visualization Type",
|
| 777 |
+
choices=[
|
| 778 |
+
"Time Series",
|
| 779 |
+
"Distribution (Histogram)",
|
| 780 |
+
"Distribution (Box Plot)",
|
| 781 |
+
"Category (Bar Chart)",
|
| 782 |
+
"Category (Pie Chart)",
|
| 783 |
+
"Scatter Plot",
|
| 784 |
+
"Correlation Heatmap"
|
| 785 |
+
],
|
| 786 |
+
value="Time Series"
|
| 787 |
+
)
|
| 788 |
+
|
| 789 |
+
# Dynamic parameter inputs
|
| 790 |
+
with gr.Group() as time_series_group:
|
| 791 |
+
ts_date_col = gr.Dropdown(label="Date Column", choices=[])
|
| 792 |
+
ts_value_col = gr.Dropdown(label="Value Column", choices=[])
|
| 793 |
+
ts_agg = gr.Dropdown(
|
| 794 |
+
label="Aggregation",
|
| 795 |
+
choices=["sum", "mean", "count", "median"],
|
| 796 |
+
value="sum"
|
| 797 |
+
)
|
| 798 |
+
|
| 799 |
+
with gr.Group(visible=False) as distribution_group:
|
| 800 |
+
dist_col = gr.Dropdown(label="Column", choices=[])
|
| 801 |
+
dist_bins = gr.Slider(label="Number of Bins", minimum=10, maximum=100, value=30, step=5)
|
| 802 |
+
|
| 803 |
+
with gr.Group(visible=False) as category_group:
|
| 804 |
+
cat_col = gr.Dropdown(label="Category Column", choices=[])
|
| 805 |
+
cat_value_col = gr.Dropdown(label="Value Column (optional)", choices=[])
|
| 806 |
+
cat_agg = gr.Dropdown(
|
| 807 |
+
label="Aggregation",
|
| 808 |
+
choices=["count", "sum", "mean", "median"],
|
| 809 |
+
value="count"
|
| 810 |
+
)
|
| 811 |
+
cat_top_n = gr.Slider(label="Top N Categories", minimum=5, maximum=20, value=10, step=1)
|
| 812 |
+
|
| 813 |
+
with gr.Group(visible=False) as scatter_group:
|
| 814 |
+
scatter_x = gr.Dropdown(label="X Column", choices=[])
|
| 815 |
+
scatter_y = gr.Dropdown(label="Y Column", choices=[])
|
| 816 |
+
scatter_color = gr.Dropdown(label="Color by (optional)", choices=[])
|
| 817 |
+
scatter_trend = gr.Checkbox(label="Show Trend Line", value=False)
|
| 818 |
+
|
| 819 |
+
with gr.Group(visible=False) as correlation_group:
|
| 820 |
+
corr_method = gr.Dropdown(
|
| 821 |
+
label="Correlation Method",
|
| 822 |
+
choices=["pearson", "spearman", "kendall"],
|
| 823 |
+
value="pearson"
|
| 824 |
+
)
|
| 825 |
+
|
| 826 |
+
create_viz_btn = gr.Button("📊 Create Visualization", variant="primary")
|
| 827 |
+
|
| 828 |
+
with gr.Column(scale=2):
|
| 829 |
+
viz_output = gr.Plot(label="Visualization")
|
| 830 |
+
viz_status = gr.Textbox(label="Status", lines=2, interactive=False)
|
| 831 |
+
|
        def update_viz_controls(viz_type_value):
            if app_state.filtered_df is None:
                return [gr.Group(visible=False)] * 5 + [gr.Dropdown(choices=[])] * 8

            col_types = get_column_types(app_state.filtered_df)

            # FIXED: Return format with value=None to force refresh
            # [5 Groups] + [8 Dropdowns]
            # Groups: time_series_group, distribution_group, category_group, scatter_group, correlation_group
            # Dropdowns: ts_date_col, ts_value_col, dist_col, cat_col, cat_value_col, scatter_x, scatter_y, scatter_color

            if viz_type_value == "Time Series":
                return (
                    gr.Group(visible=True),   # time_series_group
                    gr.Group(visible=False),  # distribution_group
                    gr.Group(visible=False),  # category_group
                    gr.Group(visible=False),  # scatter_group
                    gr.Group(visible=False),  # correlation_group
                    gr.Dropdown(choices=col_types['datetime'], value=None),     # ts_date_col
                    gr.Dropdown(choices=col_types['numerical'], value=None),    # ts_value_col
                    gr.Dropdown(choices=col_types['numerical'], value=None),    # dist_col
                    gr.Dropdown(choices=col_types['categorical'], value=None),  # cat_col
                    gr.Dropdown(choices=col_types['numerical'], value=None),    # cat_value_col
                    gr.Dropdown(choices=col_types['numerical'], value=None),    # scatter_x
                    gr.Dropdown(choices=col_types['numerical'], value=None),    # scatter_y
                    gr.Dropdown(choices=col_types['categorical'] + col_types['numerical'], value=None)  # scatter_color
                )

            elif "Distribution" in viz_type_value:
                return (
                    gr.Group(visible=False),
                    gr.Group(visible=True),
                    gr.Group(visible=False),
                    gr.Group(visible=False),
                    gr.Group(visible=False),
                    gr.Dropdown(choices=col_types['datetime'], value=None),
                    gr.Dropdown(choices=col_types['numerical'], value=None),
                    gr.Dropdown(choices=col_types['numerical'], value=None),  # dist_col - visible
                    gr.Dropdown(choices=col_types['categorical'], value=None),
                    gr.Dropdown(choices=col_types['numerical'], value=None),
                    gr.Dropdown(choices=col_types['numerical'], value=None),
                    gr.Dropdown(choices=col_types['numerical'], value=None),
                    gr.Dropdown(choices=col_types['categorical'] + col_types['numerical'], value=None)
                )

            elif "Category" in viz_type_value:
                return (
                    gr.Group(visible=False),
                    gr.Group(visible=False),
                    gr.Group(visible=True),
                    gr.Group(visible=False),
                    gr.Group(visible=False),
                    gr.Dropdown(choices=col_types['datetime'], value=None),
                    gr.Dropdown(choices=col_types['numerical'], value=None),
                    gr.Dropdown(choices=col_types['numerical'], value=None),
                    gr.Dropdown(choices=col_types['categorical'], value=None),  # cat_col - visible
                    gr.Dropdown(choices=col_types['numerical'], value=None),    # cat_value_col - visible
                    gr.Dropdown(choices=col_types['numerical'], value=None),
                    gr.Dropdown(choices=col_types['numerical'], value=None),
                    gr.Dropdown(choices=col_types['categorical'] + col_types['numerical'], value=None)
                )

            elif viz_type_value == "Scatter Plot":
                return (
                    gr.Group(visible=False),
                    gr.Group(visible=False),
                    gr.Group(visible=False),
                    gr.Group(visible=True),
                    gr.Group(visible=False),
                    gr.Dropdown(choices=col_types['datetime'], value=None),
                    gr.Dropdown(choices=col_types['numerical'], value=None),
                    gr.Dropdown(choices=col_types['numerical'], value=None),
                    gr.Dropdown(choices=col_types['categorical'], value=None),
                    gr.Dropdown(choices=col_types['numerical'], value=None),
                    gr.Dropdown(choices=col_types['numerical'], value=None),   # scatter_x - visible
                    gr.Dropdown(choices=col_types['numerical'], value=None),   # scatter_y - visible
                    gr.Dropdown(choices=col_types['categorical'] + col_types['numerical'], value=None)  # scatter_color - visible
                )

            else:  # Correlation Heatmap
                return (
                    gr.Group(visible=False),
                    gr.Group(visible=False),
                    gr.Group(visible=False),
                    gr.Group(visible=False),
                    gr.Group(visible=True),
                    gr.Dropdown(choices=col_types['datetime'], value=None),
                    gr.Dropdown(choices=col_types['numerical'], value=None),
                    gr.Dropdown(choices=col_types['numerical'], value=None),
                    gr.Dropdown(choices=col_types['categorical'], value=None),
                    gr.Dropdown(choices=col_types['numerical'], value=None),
                    gr.Dropdown(choices=col_types['numerical'], value=None),
                    gr.Dropdown(choices=col_types['numerical'], value=None),
                    gr.Dropdown(choices=col_types['categorical'] + col_types['numerical'], value=None)
                )

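        # --- Editor's sketch (not part of the commit) -------------------------
        # update_viz_controls relies on a Gradio convention: a handler wired to
        # N output components returns N values, and returning a freshly
        # constructed component (e.g. gr.Dropdown(choices=..., value=None))
        # replaces that component's properties in the UI. Setting value=None is
        # what clears a stale selection when the choices list changes. The same
        # pattern in isolation, with hypothetical names:
        #
        #     def toggle(kind):
        #         return gr.Group(visible=(kind == "A")), \
        #                gr.Dropdown(choices=["x", "y"], value=None)
        #
        #     kind_dd.change(fn=toggle, inputs=[kind_dd], outputs=[group_a, col_dd])
        # ----------------------------------------------------------------------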
        def create_visualization(viz_type_value, date_col, value_col, agg,
                                 dist_column, bins, cat_column, cat_value, cat_aggregation, top_n,
                                 x_col, y_col, color_col, trend, corr_method_value):
            if app_state.filtered_df is None or app_state.filtered_df.empty:
                return None, "⚠️ No data available"

            try:
                if viz_type_value == "Time Series":
                    if not date_col or not value_col:
                        return None, "⚠️ Please select date and value columns"

                    fig = app_state.viz_manager.create_visualization(
                        'time_series',
                        app_state.filtered_df,
                        date_column=date_col,
                        value_column=value_col,
                        aggregation=agg,
                        backend='matplotlib'
                    )
                    status = f"✅ Created time series plot: {value_col} over {date_col}"

                elif "Distribution" in viz_type_value:
                    if not dist_column:
                        return None, "⚠️ Please select a column"

                    plot_type = 'histogram' if 'Histogram' in viz_type_value else 'box'

                    fig = app_state.viz_manager.create_visualization(
                        'distribution',
                        app_state.filtered_df,
                        column=dist_column,
                        plot_type=plot_type,
                        bins=int(bins),
                        backend='matplotlib'
                    )
                    status = f"✅ Created {plot_type} plot for {dist_column}"

                elif "Category" in viz_type_value:
                    if not cat_column:
                        return None, "⚠️ Please select a category column"

                    plot_type = 'bar' if 'Bar' in viz_type_value else 'pie'

                    fig = app_state.viz_manager.create_visualization(
                        'category',
                        app_state.filtered_df,
                        column=cat_column,
                        value_column=cat_value if cat_value else None,
                        plot_type=plot_type,
                        aggregation=cat_aggregation,
                        top_n=int(top_n),
                        backend='matplotlib'
                    )
                    status = f"✅ Created {plot_type} chart for {cat_column}"

                elif viz_type_value == "Scatter Plot":
                    if not x_col or not y_col:
                        return None, "⚠️ Please select X and Y columns"

                    fig = app_state.viz_manager.create_visualization(
                        'scatter',
                        app_state.filtered_df,
                        x_column=x_col,
                        y_column=y_col,
                        color_column=color_col if color_col else None,
                        show_trend=trend,
                        backend='matplotlib'
                    )
                    status = f"✅ Created scatter plot: {y_col} vs {x_col}"

                else:  # Correlation Heatmap
                    fig = app_state.viz_manager.create_visualization(
                        'correlation',
                        app_state.filtered_df,
                        method=corr_method_value,
                        backend='matplotlib'
                    )
                    status = "✅ Created correlation heatmap"

                return fig, status

            except Exception as e:
                logger.error(f"Error creating visualization: {e}")
                import traceback
                traceback.print_exc()
                return None, f"❌ Error: {str(e)}"

        viz_type.change(
            fn=update_viz_controls,
            inputs=[viz_type],
            outputs=[
                time_series_group, distribution_group, category_group,
                scatter_group, correlation_group,
                ts_date_col, ts_value_col, dist_col, cat_col, cat_value_col,
                scatter_x, scatter_y, scatter_color
            ]
        )

        create_viz_btn.click(
            fn=create_visualization,
            inputs=[
                viz_type, ts_date_col, ts_value_col, ts_agg,
                dist_col, dist_bins, cat_col, cat_value_col, cat_agg, cat_top_n,
                scatter_x, scatter_y, scatter_color, scatter_trend, corr_method
            ],
            outputs=[viz_output, viz_status]
        )

        return (viz_type, recommend_btn, recommendations_output, rec_buttons_row,
                rec_btn_1, rec_btn_2, rec_btn_3, rec_viz_output, rec_status,
                viz_output, viz_status, create_viz_btn)

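# --- Editor's sketch (not part of the commit) --------------------------------
# Gradio passes the current values of the `inputs` components to the handler
# positionally, so the 15-item inputs list in create_viz_btn.click() must match
# create_visualization's parameter order exactly; reordering either side would
# silently shift every argument. In miniature, with hypothetical names:
#
#     def handler(a_value, b_value):
#         return f"{a_value}/{b_value}"
#
#     btn.click(fn=handler, inputs=[a_box, b_box], outputs=[result_box])
# ------------------------------------------------------------------------------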

# ============================================================================
# TAB 5: INSIGHTS
# ============================================================================

def create_insights_tab():
    """Create automated insights tab."""

    with gr.Tab("💡 Insights"):
        gr.Markdown("## Automated Insights")
        gr.Markdown("Generate intelligent insights from your data automatically")

        with gr.Row():
            generate_all_btn = gr.Button("🚀 Generate All Insights", variant="primary", scale=2)
            generate_custom_btn = gr.Button("⚙️ Generate Custom Insight", variant="secondary", scale=1)

        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("### Custom Insight Options")

                insight_type = gr.Dropdown(
                    label="Insight Type",
                    choices=[
                        "Top/Bottom Performers",
                        "Trend Analysis",
                        "Anomaly Detection",
                        "Distribution Analysis",
                        "Correlation Analysis"
                    ],
                    value="Top/Bottom Performers"
                )

                insight_column = gr.Dropdown(label="Select Column", choices=[])
                insight_column2 = gr.Dropdown(label="Second Column (for trends)", choices=[], visible=False)

            with gr.Column(scale=2):
                insights_output = gr.Textbox(
                    label="Insights Report",
                    lines=20,
                    interactive=False
                )

        def update_insight_columns(insight_type_value):
            if app_state.filtered_df is None:
                return gr.Dropdown(choices=[]), gr.Dropdown(choices=[], visible=False)

            col_types = get_column_types(app_state.filtered_df)

            if insight_type_value == "Trend Analysis":
                return (
                    gr.Dropdown(choices=col_types['datetime']),
                    gr.Dropdown(choices=col_types['numerical'], visible=True)
                )
            else:
                all_cols = col_types['numerical'] + col_types['categorical']
                return (
                    gr.Dropdown(choices=all_cols),
                    gr.Dropdown(choices=[], visible=False)
                )

        def generate_all_insights():
            if app_state.filtered_df is None or app_state.filtered_df.empty:
                return "⚠️ No data available. Please load a dataset first."

            try:
                insights = app_state.insight_manager.generate_all_insights(app_state.filtered_df)
                report = app_state.insight_manager.format_insight_report(insights)
                return report
            except Exception as e:
                logger.error(f"Error generating insights: {e}")
                return f"❌ Error generating insights: {str(e)}"

        def generate_custom_insight(insight_type_value, col1, col2):
            if app_state.filtered_df is None or app_state.filtered_df.empty:
                return "⚠️ No data available"

            if not col1:
                return "⚠️ Please select a column"

            try:
                if insight_type_value == "Top/Bottom Performers":
                    insight = app_state.insight_manager.generate_insight(
                        'top_bottom',
                        app_state.filtered_df,
                        column=col1
                    )

                elif insight_type_value == "Trend Analysis":
                    if not col2:
                        return "⚠️ Please select both date and value columns"

                    insight = app_state.insight_manager.generate_insight(
                        'trend',
                        app_state.filtered_df,
                        date_column=col1,
                        value_column=col2
                    )

                elif insight_type_value == "Anomaly Detection":
                    insight = app_state.insight_manager.generate_insight(
                        'anomaly',
                        app_state.filtered_df,
                        column=col1
                    )

                elif insight_type_value == "Distribution Analysis":
                    insight = app_state.insight_manager.generate_insight(
                        'distribution',
                        app_state.filtered_df,
                        column=col1
                    )

                else:  # Correlation Analysis
                    insight = app_state.insight_manager.generate_insight(
                        'correlation',
                        app_state.filtered_df
                    )

                # Format single insight
                report = f"## {insight_type_value}\n\n"
                report += f"**Summary**: {insight.get('summary', 'No summary available')}\n\n"

                return report

            except Exception as e:
                logger.error(f"Error generating custom insight: {e}")
                return f"❌ Error: {str(e)}"

        insight_type.change(
            fn=update_insight_columns,
            inputs=[insight_type],
            outputs=[insight_column, insight_column2]
        )

        generate_all_btn.click(
            fn=generate_all_insights,
            outputs=[insights_output]
        )

        generate_custom_btn.click(
            fn=generate_custom_insight,
            inputs=[insight_type, insight_column, insight_column2],
            outputs=[insights_output]
        )

        return generate_all_btn, insight_type, insights_output

# ============================================================================
# TAB 6: EXPORT
# ============================================================================

def create_export_tab():
    """Create data export tab."""

    with gr.Tab("💾 Export"):
        gr.Markdown("## Export Data & Visualizations")

        with gr.Row():
            with gr.Column():
                gr.Markdown("### Export Filtered Data")
                export_format = gr.Radio(
                    choices=["CSV", "Excel"],
                    label="Export Format",
                    value="CSV"
                )

                export_data_btn = gr.Button("📥 Export Data", variant="primary")
                export_file = gr.File(label="Download File")
                export_status = gr.Textbox(label="Status", lines=2, interactive=False)

            with gr.Column():
                gr.Markdown("### Export Instructions")
                gr.Markdown("""
                **Export Your Data:**
                1. Apply any filters you want in the Filter tab
                2. Select your preferred export format
                3. Click 'Export Data' to download

                **Note:** The export will include only the filtered data.
                """)

        def export_data(format_choice):
            if app_state.filtered_df is None or app_state.filtered_df.empty:
                return None, "⚠️ No data to export"

            try:
                import tempfile

                if format_choice == "CSV":
                    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.csv')
                    temp_file.close()  # release the handle so the exporter can write to the path (required on Windows)
                    exporter = CSVExporter()
                    exporter.export(app_state.filtered_df, temp_file.name)
                    status = f"✅ Exported {len(app_state.filtered_df)} rows to CSV"
                else:  # Excel
                    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.xlsx')
                    temp_file.close()  # release the handle so the exporter can write to the path (required on Windows)
                    exporter = ExcelExporter()
                    exporter.export(app_state.filtered_df, temp_file.name)
                    status = f"✅ Exported {len(app_state.filtered_df)} rows to Excel"

                return temp_file.name, status

            except Exception as e:
                logger.error(f"Error exporting data: {e}")
                return None, f"❌ Error: {str(e)}"

        export_data_btn.click(
            fn=export_data,
            inputs=[export_format],
            outputs=[export_file, export_status]
        )

        return export_data_btn, export_file, export_status

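# --- Editor's note (sketch, not part of the commit) ---------------------------
# export_data creates temp files with delete=False so they survive the handler
# and gr.File can serve them for download; closing the handle before the
# exporter writes keeps this working on Windows, where an open file cannot be
# reopened for writing. Nothing removes these files afterwards, so a
# long-running deployment would want a cleanup step, e.g. (hypothetical):
#
#     import glob, os, tempfile
#     for stale in glob.glob(os.path.join(tempfile.gettempdir(), 'tmp*.csv')):
#         os.unlink(stale)
# ------------------------------------------------------------------------------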
+
# ============================================================================
|
| 1256 |
+
# MAIN APPLICATION
|
| 1257 |
+
# ============================================================================
|
| 1258 |
+
|
| 1259 |
+
def create_dashboard():
|
| 1260 |
+
"""Create the main Business Intelligence Dashboard."""
|
| 1261 |
+
|
| 1262 |
+
with gr.Blocks(title="Business Intelligence Dashboard") as demo:
|
| 1263 |
+
|
| 1264 |
+
# Header
|
| 1265 |
+
gr.Markdown("""
|
| 1266 |
+
# 📊 Business Intelligence Dashboard
|
| 1267 |
+
### Explore, Analyze, and Extract Insights from Your Data
|
| 1268 |
+
|
| 1269 |
+
**Features:** Smart Visualizations | Automated Insights | Interactive Filtering | Data Export
|
| 1270 |
+
""")
|
| 1271 |
+
|
| 1272 |
+
# Create all tabs and capture their components
|
| 1273 |
+
with gr.Tabs():
|
| 1274 |
+
# Tab 1: Dataset Selection
|
| 1275 |
+
(dataset_dropdown, status_box, dataset_info, data_preview,
|
| 1276 |
+
load_btn, upload_btn) = create_dataset_tab()
|
| 1277 |
+
|
| 1278 |
+
# Tab 2: Statistics
|
| 1279 |
+
(profile_btn, missing_values, numerical_summary,
|
| 1280 |
+
categorical_summary, correlation_plot) = create_statistics_tab()
|
| 1281 |
+
|
| 1282 |
+
# Tab 3: Filter
|
| 1283 |
+
(filter_type, column_select, filter_status, row_count,
|
| 1284 |
+
filtered_preview, add_filter_btn, clear_filters_btn) = create_filter_tab()
|
| 1285 |
+
|
| 1286 |
+
# Tab 4: Visualizations
|
| 1287 |
+
(viz_type, recommend_btn, recommendations_output, rec_buttons_row,
|
| 1288 |
+
rec_btn_1, rec_btn_2, rec_btn_3, rec_viz_output, rec_status,
|
| 1289 |
+
viz_output, viz_status, create_viz_btn) = create_visualization_tab()
|
| 1290 |
+
|
| 1291 |
+
# Tab 5: Insights
|
| 1292 |
+
(generate_all_btn, insight_type, insights_output) = create_insights_tab()
|
| 1293 |
+
|
| 1294 |
+
# Tab 6: Export
|
| 1295 |
+
(export_btn, export_file, export_status_export) = create_export_tab()
|
| 1296 |
+
|
| 1297 |
+
# Footer
|
| 1298 |
+
gr.Markdown("""
|
| 1299 |
+
---
|
| 1300 |
+
**Business Intelligence Dashboard** | Built with Gradio, Pandas, Matplotlib, and Plotly
|
| 1301 |
+
|
| 1302 |
+
*Tip: Start by loading a dataset from the Dataset Selection tab!*
|
| 1303 |
+
""")
|
| 1304 |
+
|
| 1305 |
+
# Connect load button to reset all tabs
|
| 1306 |
+
def load_and_reset(dataset_name):
|
| 1307 |
+
# Load dataset
|
| 1308 |
+
if not dataset_name:
|
| 1309 |
+
return (
|
| 1310 |
+
None, "⚠️ Please select a dataset", "", None,
|
| 1311 |
+
None, None, "", None,
|
| 1312 |
+
"No filters applied", "0 rows", None,
|
| 1313 |
+
"Click the button to get recommendations",
|
| 1314 |
+
None, None,
|
| 1315 |
+
""
|
| 1316 |
+
)
|
| 1317 |
+
|
| 1318 |
+
df, status = app_state.load_dataset(dataset_name)
|
| 1319 |
+
|
| 1320 |
+
if df is not None:
|
| 1321 |
+
info = f"📊 **Dataset**: {dataset_name}\n"
|
| 1322 |
+
info += f"📏 **Shape**: {df.shape[0]} rows × {df.shape[1]} columns\n"
|
| 1323 |
+
info += f"💾 **Memory**: {df.memory_usage(deep=True).sum() / 1024 / 1024:.2f} MB\n\n"
|
| 1324 |
+
info += f"**Column Types**:\n"
|
| 1325 |
+
|
| 1326 |
+
col_types = get_column_types(df)
|
| 1327 |
+
info += f"- Numerical: {len(col_types['numerical'])}\n"
|
| 1328 |
+
info += f"- Categorical: {len(col_types['categorical'])}\n"
|
| 1329 |
+
info += f"- DateTime: {len(col_types['datetime'])}\n"
|
| 1330 |
+
|
| 1331 |
+
preview = df.head(100)
|
| 1332 |
+
|
| 1333 |
+
return (
|
| 1334 |
+
dataset_name, status, info, preview,
|
| 1335 |
+
None, None, "", None,
|
| 1336 |
+
"No filters applied", "0 rows", None,
|
| 1337 |
+
"Click the button to get recommendations",
|
| 1338 |
+
None, None,
|
| 1339 |
+
""
|
| 1340 |
+
)
|
| 1341 |
+
|
| 1342 |
+
return (
|
| 1343 |
+
None, status, "", None,
|
| 1344 |
+
None, None, "", None,
|
| 1345 |
+
"No filters applied", "0 rows", None,
|
| 1346 |
+
"Click the button to get recommendations",
|
| 1347 |
+
None, None,
|
| 1348 |
+
""
|
| 1349 |
+
)
|
| 1350 |
+
|
| 1351 |
+
load_btn.click(
|
| 1352 |
+
fn=load_and_reset,
|
| 1353 |
+
inputs=[dataset_dropdown],
|
| 1354 |
+
outputs=[
|
| 1355 |
+
dataset_dropdown, status_box, dataset_info, data_preview,
|
| 1356 |
+
missing_values, numerical_summary, categorical_summary, correlation_plot,
|
| 1357 |
+
filter_status, row_count, filtered_preview,
|
| 1358 |
+
recommendations_output,
|
| 1359 |
+
viz_output, viz_status,
|
| 1360 |
+
insights_output
|
| 1361 |
+
]
|
| 1362 |
+
)
|
| 1363 |
+
|
| 1364 |
+
return demo
|
| 1365 |
+
|
| 1366 |
+
|
| 1367 |
+
# ============================================================================
|
| 1368 |
+
# LAUNCH APPLICATION
|
| 1369 |
+
# ============================================================================
|
| 1370 |
+
|
| 1371 |
+
if __name__ == "__main__":
|
| 1372 |
+
logger.info("Starting Business Intelligence Dashboard...")
|
| 1373 |
+
|
| 1374 |
+
# Create and launch dashboard
|
| 1375 |
+
demo = create_dashboard()
|
| 1376 |
+
demo.launch(
|
| 1377 |
+
share=False,
|
| 1378 |
+
show_error=True
|
| 1379 |
+
)
|
data/.DS_Store
ADDED
Binary file (6.15 kB)
data/Airbnb.csv
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ecb59a7598d2aaf7dc2ed00c724648a319d93a916a3d4767e2bed0dbe0f1a7f8
size 35913454
data/Online_Retail.xlsx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:43465a06f2ccf7c8b5bd2892bc7defb52f97487934fe93b16ae4c3936424676d
size 23715344
data_processor.py
ADDED
@@ -0,0 +1,819 @@
"""
Data Processor Module for Business Intelligence Dashboard

This module handles all data loading, cleaning, validation, and filtering operations.
Implements SOLID principles with Strategy Pattern for flexible data processing.

Author: Craig
Date: December 2024
"""

import pandas as pd
import numpy as np
from pathlib import Path
from typing import Union, List, Dict, Optional, Any, Tuple
from abc import ABC, abstractmethod
import logging
from datetime import datetime

from utils import (
    FileValidator, DataFrameValidator, ColumnValidator,
    get_column_types, detect_date_columns, clean_currency_column,
    Config
)

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


# ============================================================================
# STRATEGY PATTERN - Data Loading Strategies
# Follows Open/Closed Principle and Strategy Pattern
# ============================================================================

class DataLoadStrategy(ABC):
    """
    Abstract base class for data loading strategies.
    Follows Strategy Pattern - allows different loading algorithms to be selected at runtime.
    """

    @abstractmethod
    def load(self, filepath: Union[str, Path]) -> pd.DataFrame:
        """
        Load data from file.

        Args:
            filepath: Path to the data file

        Returns:
            pd.DataFrame: Loaded data
        """
        pass

    @abstractmethod
    def can_handle(self, filepath: Union[str, Path]) -> bool:
        """
        Check if this strategy can handle the given file.

        Args:
            filepath: Path to check

        Returns:
            bool: True if this strategy can handle the file
        """
        pass


class CSVLoadStrategy(DataLoadStrategy):
    """
    Strategy for loading CSV files.
    Follows Single Responsibility Principle - only handles CSV loading.
    """

    def can_handle(self, filepath: Union[str, Path]) -> bool:
        """Check if file is CSV format."""
        return str(filepath).lower().endswith('.csv')

    def load(self, filepath: Union[str, Path]) -> pd.DataFrame:
        """
        Load CSV file with automatic encoding detection.

        Args:
            filepath: Path to CSV file

        Returns:
            pd.DataFrame: Loaded data

        Raises:
            Exception: If loading fails
        """
        try:
            # Try UTF-8 first
            df = pd.read_csv(filepath, encoding='utf-8')
            logger.info(f"Successfully loaded CSV file: {filepath}")
            return df
        except UnicodeDecodeError:
            try:
                # Fallback to latin-1
                df = pd.read_csv(filepath, encoding='latin-1')
                logger.info(f"Successfully loaded CSV file with latin-1 encoding: {filepath}")
                return df
            except Exception as e:
                logger.error(f"Error loading CSV file: {e}")
                raise Exception(f"Failed to load CSV file: {str(e)}")


class ExcelLoadStrategy(DataLoadStrategy):
    """
    Strategy for loading Excel files.
    Follows Single Responsibility Principle - only handles Excel loading.
    """

    def can_handle(self, filepath: Union[str, Path]) -> bool:
        """Check if file is Excel format."""
        extension = str(filepath).lower()
        return extension.endswith('.xlsx') or extension.endswith('.xls')

    def load(self, filepath: Union[str, Path]) -> pd.DataFrame:
        """
        Load Excel file.

        Args:
            filepath: Path to Excel file

        Returns:
            pd.DataFrame: Loaded data

        Raises:
            Exception: If loading fails
        """
        try:
            df = pd.read_excel(filepath, engine='openpyxl')
            logger.info(f"Successfully loaded Excel file: {filepath}")
            return df
        except Exception as e:
            logger.error(f"Error loading Excel file: {e}")
            raise Exception(f"Failed to load Excel file: {str(e)}")


class JSONLoadStrategy(DataLoadStrategy):
    """
    Strategy for loading JSON files.
    Follows Single Responsibility Principle - only handles JSON loading.
    """

    def can_handle(self, filepath: Union[str, Path]) -> bool:
        """Check if file is JSON format."""
        return str(filepath).lower().endswith('.json')

    def load(self, filepath: Union[str, Path]) -> pd.DataFrame:
        """
        Load JSON file.

        Args:
            filepath: Path to JSON file

        Returns:
            pd.DataFrame: Loaded data

        Raises:
            Exception: If loading fails
        """
        try:
            df = pd.read_json(filepath)
            logger.info(f"Successfully loaded JSON file: {filepath}")
            return df
        except Exception as e:
            logger.error(f"Error loading JSON file: {e}")
            raise Exception(f"Failed to load JSON file: {str(e)}")


class ParquetLoadStrategy(DataLoadStrategy):
    """
    Strategy for loading Parquet files.
    Follows Single Responsibility Principle - only handles Parquet loading.
    """

    def can_handle(self, filepath: Union[str, Path]) -> bool:
        """Check if file is Parquet format."""
        return str(filepath).lower().endswith('.parquet')

    def load(self, filepath: Union[str, Path]) -> pd.DataFrame:
        """
        Load Parquet file.

        Args:
            filepath: Path to Parquet file

        Returns:
            pd.DataFrame: Loaded data

        Raises:
            Exception: If loading fails
        """
        try:
            df = pd.read_parquet(filepath)
            logger.info(f"Successfully loaded Parquet file: {filepath}")
            return df
        except Exception as e:
            logger.error(f"Error loading Parquet file: {e}")
            raise Exception(f"Failed to load Parquet file: {str(e)}")


# ============================================================================
# DATA LOADER CONTEXT
# Uses Strategy Pattern to select appropriate loading strategy
# ============================================================================

class DataLoader:
    """
    Context class for data loading using Strategy Pattern.
    Automatically selects the appropriate loading strategy based on file type.
    Follows Open/Closed Principle - open for extension (new strategies), closed for modification.
    """

    def __init__(self):
        """Initialize DataLoader with all available strategies."""
        self.strategies: List[DataLoadStrategy] = [
            CSVLoadStrategy(),
            ExcelLoadStrategy(),
            JSONLoadStrategy(),
            ParquetLoadStrategy()
        ]
        self.file_validator = FileValidator()

    def load_data(self, filepath: Union[str, Path]) -> pd.DataFrame:
        """
        Load data using appropriate strategy.

        Args:
            filepath: Path to data file

        Returns:
            pd.DataFrame: Loaded data

        Raises:
            FileNotFoundError: If file doesn't exist
            ValueError: If file format is not supported
            Exception: If loading fails
        """
        # Validate file
        self.file_validator.validate(filepath)

        # Find appropriate strategy
        for strategy in self.strategies:
            if strategy.can_handle(filepath):
                df = strategy.load(filepath)
                logger.info(f"Loaded {len(df)} rows and {len(df.columns)} columns")
                return df

        # No strategy found
        raise ValueError(f"No loading strategy available for file: {filepath}")

    def add_strategy(self, strategy: DataLoadStrategy) -> None:
        """
        Add a new loading strategy.
        Follows Open/Closed Principle - extend functionality without modifying existing code.

        Args:
            strategy: New loading strategy to add
        """
        self.strategies.append(strategy)
        logger.info(f"Added new loading strategy: {strategy.__class__.__name__}")

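# --- Editor's sketch (not part of the commit) ---------------------------------
# Because DataLoader depends only on the DataLoadStrategy abstraction, a new
# format plugs in without modifying the loader. A hypothetical tab-separated
# strategy, mirroring CSVLoadStrategy:

class TSVLoadStrategy(DataLoadStrategy):
    """Hypothetical strategy for .tsv files (illustration only)."""

    def can_handle(self, filepath: Union[str, Path]) -> bool:
        return str(filepath).lower().endswith('.tsv')

    def load(self, filepath: Union[str, Path]) -> pd.DataFrame:
        # Same pandas reader as CSV, just a tab delimiter
        return pd.read_csv(filepath, sep='\t')

# Registration is one call: DataLoader().add_strategy(TSVLoadStrategy())
# ------------------------------------------------------------------------------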

# ============================================================================
# DATA CLEANING
# Follows Single Responsibility Principle
# ============================================================================

class DataCleaner:
    """
    Handles data cleaning operations.
    Follows Single Responsibility Principle - only responsible for cleaning data.
    """

    @staticmethod
    def handle_missing_values(df: pd.DataFrame, strategy: str = 'none') -> pd.DataFrame:
        """
        Handle missing values in DataFrame.

        Args:
            df: DataFrame to clean
            strategy: Strategy for handling missing values
                'none' - do nothing
                'drop' - drop rows with any missing values
                'fill_mean' - fill numerical columns with mean
                'fill_median' - fill numerical columns with median
                'fill_mode' - fill categorical columns with mode

        Returns:
            pd.DataFrame: Cleaned DataFrame
        """
        if strategy == 'none':
            return df.copy()

        df_cleaned = df.copy()

        if strategy == 'drop':
            df_cleaned = df_cleaned.dropna()
            logger.info(f"Dropped rows with missing values. Remaining rows: {len(df_cleaned)}")

        elif strategy == 'fill_mean':
            numerical_cols = df_cleaned.select_dtypes(include=[np.number]).columns
            for col in numerical_cols:
                # Assign instead of fillna(inplace=True): chained in-place fills
                # on a column are deprecated in pandas 2.x
                df_cleaned[col] = df_cleaned[col].fillna(df_cleaned[col].mean())
            logger.info(f"Filled missing values with mean for {len(numerical_cols)} columns")

        elif strategy == 'fill_median':
            numerical_cols = df_cleaned.select_dtypes(include=[np.number]).columns
            for col in numerical_cols:
                df_cleaned[col] = df_cleaned[col].fillna(df_cleaned[col].median())
            logger.info(f"Filled missing values with median for {len(numerical_cols)} columns")

        elif strategy == 'fill_mode':
            for col in df_cleaned.columns:
                if df_cleaned[col].dtype == 'object':
                    mode_value = df_cleaned[col].mode()
                    if len(mode_value) > 0:
                        df_cleaned[col] = df_cleaned[col].fillna(mode_value[0])
            logger.info("Filled missing values with mode for categorical columns")

        return df_cleaned

    @staticmethod
    def convert_data_types(df: pd.DataFrame) -> pd.DataFrame:
        """
        Automatically convert data types (dates, currencies, etc.).

        Args:
            df: DataFrame to convert

        Returns:
            pd.DataFrame: DataFrame with converted types
        """
        df_converted = df.copy()

        # Detect and convert date columns
        date_columns = detect_date_columns(df_converted)
        for col in date_columns:
            try:
                df_converted[col] = pd.to_datetime(df_converted[col], errors='coerce')
                logger.info(f"Converted column '{col}' to datetime")
            except Exception as e:
                logger.warning(f"Could not convert '{col}' to datetime: {e}")

        # Detect and convert currency columns
        for col in df_converted.select_dtypes(include=['object']).columns:
            # Check if column contains currency symbols
            sample = df_converted[col].dropna().head(10).astype(str)
            if any(any(symbol in str(val) for symbol in ['$', '€', '£', '¥']) for val in sample):
                try:
                    df_converted[col] = clean_currency_column(df_converted[col])
                    logger.info(f"Converted column '{col}' from currency to numeric")
                except Exception as e:
                    logger.warning(f"Could not convert '{col}' from currency: {e}")

        # Convert boolean strings to actual booleans
        for col in df_converted.select_dtypes(include=['object']).columns:
            unique_values = df_converted[col].dropna().unique()
            if len(unique_values) <= 2 and all(
                    str(v).upper() in ['TRUE', 'FALSE', 'YES', 'NO', '0', '1'] for v in unique_values):
                try:
                    # Normalise case before mapping; otherwise case variants that
                    # pass the guard (e.g. 'yes') would map to NaN and be lost
                    df_converted[col] = df_converted[col].astype(str).str.upper().map({
                        'TRUE': True, 'FALSE': False,
                        'YES': True, 'NO': False,
                        '1': True, '0': False
                    })
                    logger.info(f"Converted column '{col}' to boolean")
                except Exception as e:
                    logger.warning(f"Could not convert '{col}' to boolean: {e}")

        return df_converted

    @staticmethod
    def remove_duplicates(df: pd.DataFrame) -> pd.DataFrame:
        """
        Remove duplicate rows from DataFrame.

        Args:
            df: DataFrame to clean

        Returns:
            pd.DataFrame: DataFrame without duplicates
        """
        initial_rows = len(df)
        df_cleaned = df.drop_duplicates()
        removed_rows = initial_rows - len(df_cleaned)

        if removed_rows > 0:
            logger.info(f"Removed {removed_rows} duplicate rows")

        return df_cleaned

    @staticmethod
    def handle_outliers(df: pd.DataFrame, columns: List[str], method: str = 'zscore',
                        threshold: float = 3.0) -> pd.DataFrame:
        """
        Handle outliers in numerical columns.

        Args:
            df: DataFrame to process
            columns: List of columns to check for outliers
            method: Method for outlier detection ('zscore' or 'iqr')
            threshold: Threshold for outlier detection

        Returns:
            pd.DataFrame: DataFrame with outliers handled
        """
        df_cleaned = df.copy()

        for col in columns:
            if col not in df_cleaned.columns:
                continue

            if not pd.api.types.is_numeric_dtype(df_cleaned[col]):
                continue

            if method == 'zscore':
                # Z-score method
                z_scores = np.abs((df_cleaned[col] - df_cleaned[col].mean()) / df_cleaned[col].std())
                df_cleaned = df_cleaned[z_scores < threshold]

            elif method == 'iqr':
                # IQR method
                Q1 = df_cleaned[col].quantile(0.25)
                Q3 = df_cleaned[col].quantile(0.75)
                IQR = Q3 - Q1
                lower_bound = Q1 - threshold * IQR
                upper_bound = Q3 + threshold * IQR
                df_cleaned = df_cleaned[(df_cleaned[col] >= lower_bound) & (df_cleaned[col] <= upper_bound)]

        removed_rows = len(df) - len(df_cleaned)
        if removed_rows > 0:
            logger.info(f"Removed {removed_rows} outlier rows")

        return df_cleaned

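# --- Editor's sketch (not part of the commit) ---------------------------------
# Typical DataCleaner usage chains the static helpers; the column name below is
# illustrative. Note the threshold semantics differ by method: 3.0 is a common
# z-score cutoff, while the conventional IQR fence multiplier is 1.5.
#
#     df = DataCleaner.convert_data_types(raw_df)
#     df = DataCleaner.remove_duplicates(df)
#     df = DataCleaner.handle_missing_values(df, strategy='fill_median')
#     df = DataCleaner.handle_outliers(df, ['UnitPrice'], method='iqr', threshold=1.5)
# ------------------------------------------------------------------------------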

# ============================================================================
# DATA PROFILER
# Generates comprehensive statistics about the dataset
# ============================================================================

class DataProfiler:
    """
    Generates comprehensive data profiling statistics.
    Follows Single Responsibility Principle - only responsible for profiling.
    """

    def __init__(self, df: pd.DataFrame):
        """
        Initialize profiler with DataFrame.

        Args:
            df: DataFrame to profile
        """
        self.df = df
        self.validator = DataFrameValidator()
        self.validator.validate(df)

    def get_basic_info(self) -> Dict[str, Any]:
        """
        Get basic information about the dataset.

        Returns:
            Dict with shape, columns, data types, and memory usage
        """
        return {
            'rows': len(self.df),
            'columns': len(self.df.columns),
            'column_names': self.df.columns.tolist(),
            'data_types': self.df.dtypes.to_dict(),
            'memory_usage': f"{self.df.memory_usage(deep=True).sum() / 1024 / 1024:.2f} MB"
        }

    def get_missing_values_report(self) -> pd.DataFrame:
        """
        Generate report on missing values.

        Returns:
            DataFrame with missing value statistics per column
        """
        missing_data = pd.DataFrame({
            'Column': self.df.columns,
            'Missing_Count': self.df.isnull().sum().values,
            'Missing_Percentage': (self.df.isnull().sum().values / len(self.df) * 100).round(2)
        })

        return missing_data[missing_data['Missing_Count'] > 0].sort_values('Missing_Count', ascending=False)

    def get_numerical_summary(self) -> pd.DataFrame:
        """
        Get summary statistics for numerical columns.

        Returns:
            DataFrame with descriptive statistics
        """
        numerical_cols = self.df.select_dtypes(include=[np.number]).columns

        if len(numerical_cols) == 0:
            return pd.DataFrame()

        return self.df[numerical_cols].describe()

    def get_categorical_summary(self) -> Dict[str, Dict[str, Any]]:
        """
        Get summary statistics for categorical columns.

        Returns:
            Dict with statistics for each categorical column
        """
        categorical_cols = self.df.select_dtypes(include=['object', 'category']).columns

        summary = {}
        for col in categorical_cols:
            # Get value counts, dropping NaN values
            value_counts = self.df[col].value_counts()

            # Safely get mode
            mode_values = self.df[col].mode()
            top_value = mode_values.iloc[0] if len(mode_values) > 0 and not mode_values.empty else None

            # Safely get top frequency
            top_freq = value_counts.iloc[0] if len(value_counts) > 0 else 0

            summary[col] = {
                'unique_count': self.df[col].nunique(),
                'top_value': top_value,
                'top_value_frequency': top_freq,
                'value_counts': value_counts.head(10).to_dict()
            }

        return summary

    def get_correlation_matrix(self) -> pd.DataFrame:
        """
        Get correlation matrix for numerical columns.

        Returns:
            Correlation matrix DataFrame
        """
        numerical_cols = self.df.select_dtypes(include=[np.number]).columns

        if len(numerical_cols) < 2:
            return pd.DataFrame()

        return self.df[numerical_cols].corr()

    def get_full_profile(self) -> Dict[str, Any]:
        """
        Get comprehensive data profile.

        Returns:
            Dict with all profiling information
        """
        return {
            'basic_info': self.get_basic_info(),
            'missing_values': self.get_missing_values_report(),
            'numerical_summary': self.get_numerical_summary(),
            'categorical_summary': self.get_categorical_summary(),
            'correlation_matrix': self.get_correlation_matrix()
        }

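# --- Editor's sketch (not part of the commit) ---------------------------------
# DataProfiler validates its DataFrame in __init__, so profiling an invalid
# frame fails fast rather than on first use. Illustrative usage:
#
#     profiler = DataProfiler(df)
#     profile = profiler.get_full_profile()
#     print(profile['basic_info']['memory_usage'])
#     print(profiler.get_missing_values_report())
# ------------------------------------------------------------------------------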

# ============================================================================
# DATA FILTER
# Handles interactive filtering operations
# ============================================================================

class DataFilter:
    """
    Handles data filtering operations.
    Follows Single Responsibility Principle - only responsible for filtering.
    """

    @staticmethod
    def filter_numerical(df: pd.DataFrame, column: str, min_val: Optional[float] = None,
                         max_val: Optional[float] = None) -> pd.DataFrame:
        """
        Filter DataFrame by numerical column range.

        Args:
            df: DataFrame to filter
            column: Column name to filter
            min_val: Minimum value (inclusive)
            max_val: Maximum value (inclusive)

        Returns:
            Filtered DataFrame
        """
        ColumnValidator().validate(df, column)

        filtered_df = df.copy()

        if min_val is not None:
            filtered_df = filtered_df[filtered_df[column] >= min_val]

        if max_val is not None:
            filtered_df = filtered_df[filtered_df[column] <= max_val]

        logger.info(f"Filtered by {column}: {len(filtered_df)} rows remaining")
        return filtered_df

    @staticmethod
    def filter_categorical(df: pd.DataFrame, column: str, values: List[Any]) -> pd.DataFrame:
        """
        Filter DataFrame by categorical column values.

        Args:
            df: DataFrame to filter
            column: Column name to filter
            values: List of values to keep

        Returns:
            Filtered DataFrame
        """
        ColumnValidator().validate(df, column)

        if not values:
            return df.copy()

        filtered_df = df[df[column].isin(values)]
        logger.info(f"Filtered by {column}: {len(filtered_df)} rows remaining")
        return filtered_df

    @staticmethod
    def filter_date_range(df: pd.DataFrame, column: str, start_date: Optional[datetime] = None,
                          end_date: Optional[datetime] = None) -> pd.DataFrame:
        """
        Filter DataFrame by date range.

        Args:
            df: DataFrame to filter
            column: Date column name
            start_date: Start date (inclusive)
            end_date: End date (inclusive)

        Returns:
            Filtered DataFrame
        """
        ColumnValidator().validate(df, column)

        filtered_df = df.copy()

        # Ensure column is datetime
        if not pd.api.types.is_datetime64_any_dtype(filtered_df[column]):
            filtered_df[column] = pd.to_datetime(filtered_df[column], errors='coerce')

        if start_date is not None:
            filtered_df = filtered_df[filtered_df[column] >= start_date]

        if end_date is not None:
            filtered_df = filtered_df[filtered_df[column] <= end_date]

        logger.info(f"Filtered by date range on {column}: {len(filtered_df)} rows remaining")
        return filtered_df

    @staticmethod
    def apply_multiple_filters(df: pd.DataFrame, filters: List[Dict[str, Any]]) -> pd.DataFrame:
        """
        Apply multiple filters sequentially.

        Args:
            df: DataFrame to filter
            filters: List of filter dictionaries with keys:
                - 'type': 'numerical', 'categorical', or 'date'
                - 'column': column name
                - other keys depending on filter type

        Returns:
            Filtered DataFrame
        """
        filtered_df = df.copy()

        for filter_config in filters:
            filter_type = filter_config.get('type')
            column = filter_config.get('column')

            if filter_type == 'numerical':
                filtered_df = DataFilter.filter_numerical(
                    filtered_df,
                    column,
                    filter_config.get('min_val'),
                    filter_config.get('max_val')
                )

            elif filter_type == 'categorical':
                filtered_df = DataFilter.filter_categorical(
                    filtered_df,
                    column,
                    filter_config.get('values', [])
                )

            elif filter_type == 'date':
                filtered_df = DataFilter.filter_date_range(
                    filtered_df,
                    column,
                    filter_config.get('start_date'),
                    filter_config.get('end_date')
                )

        return filtered_df

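# --- Editor's sketch (not part of the commit) ---------------------------------
# apply_multiple_filters takes a list of plain dicts, applied in order, so each
# filter narrows the previous result (a logical AND). Column names below are
# illustrative, based on the bundled Online Retail dataset:
#
#     filters = [
#         {'type': 'numerical', 'column': 'Quantity', 'min_val': 1, 'max_val': 100},
#         {'type': 'categorical', 'column': 'Country', 'values': ['United Kingdom']},
#         {'type': 'date', 'column': 'InvoiceDate',
#          'start_date': datetime(2011, 1, 1), 'end_date': datetime(2011, 12, 31)},
#     ]
#     result = DataFilter.apply_multiple_filters(df, filters)
# ------------------------------------------------------------------------------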
| 707 |
+
|
| 708 |
+
# ============================================================================
|
| 709 |
+
# MAIN DATA PROCESSOR CLASS
|
| 710 |
+
# Facade pattern - provides simple interface to complex subsystems
|
| 711 |
+
# ============================================================================
|
| 712 |
+
|
| 713 |
+
class DataProcessor:
|
| 714 |
+
"""
|
| 715 |
+
Main data processor class using Facade pattern.
|
| 716 |
+
    Provides a simple interface to complex data loading, cleaning, and filtering operations.
    Follows Dependency Inversion Principle - depends on abstractions, not concrete implementations.
    """

    def __init__(self):
        """Initialize DataProcessor with all components."""
        self.loader = DataLoader()
        self.cleaner = DataCleaner()
        self.filter = DataFilter()
        self.current_df: Optional[pd.DataFrame] = None
        self.original_df: Optional[pd.DataFrame] = None
        self.profiler: Optional[DataProfiler] = None

    def load_and_prepare_data(self, filepath: Union[str, Path],
                              clean: bool = True,
                              remove_duplicates: bool = True) -> pd.DataFrame:
        """
        Load and prepare data with automatic cleaning.

        Args:
            filepath: Path to data file
            clean: Whether to apply automatic type conversion
            remove_duplicates: Whether to remove duplicate rows

        Returns:
            Prepared DataFrame
        """
        # Load data
        df = self.loader.load_data(filepath)
        self.original_df = df.copy()

        # Clean data
        if clean:
            df = self.cleaner.convert_data_types(df)

        if remove_duplicates:
            df = self.cleaner.remove_duplicates(df)

        self.current_df = df
        self.profiler = DataProfiler(df)

        logger.info("Data loaded and prepared successfully")
        return df

    def get_data_profile(self) -> Dict[str, Any]:
        """
        Get comprehensive data profile.

        Returns:
            Dict with profiling information
        """
        if self.profiler is None:
            raise ValueError("No data loaded. Call load_and_prepare_data first.")

        return self.profiler.get_full_profile()

    def apply_filters(self, filters: List[Dict[str, Any]]) -> pd.DataFrame:
        """
        Apply filters to current data.

        Args:
            filters: List of filter configurations

        Returns:
            Filtered DataFrame
        """
        if self.current_df is None:
            raise ValueError("No data loaded. Call load_and_prepare_data first.")

        return self.filter.apply_multiple_filters(self.current_df, filters)

    def reset_to_original(self) -> pd.DataFrame:
        """
        Reset current data to original loaded data.

        Returns:
            Original DataFrame
        """
        if self.original_df is None:
            raise ValueError("No data loaded. Call load_and_prepare_data first.")

        self.current_df = self.original_df.copy()
        return self.current_df

    def get_column_info(self) -> Dict[str, List[str]]:
        """
        Get categorized column information.

        Returns:
            Dict with numerical, categorical, and datetime columns
        """
        if self.current_df is None:
            raise ValueError("No data loaded. Call load_and_prepare_data first.")

        return get_column_types(self.current_df)


if __name__ == "__main__":
    # Example usage
    print("DataProcessor module loaded successfully")

    # Demonstrate Strategy Pattern
    processor = DataProcessor()
    print(f"Available strategies: {len(processor.loader.strategies)}")
insights.py
ADDED
@@ -0,0 +1,897 @@
"""
Insights Module for Business Intelligence Dashboard

This module handles automated insight generation from data.
Uses Strategy Pattern for different types of insights.

Author: Craig
Date: December 2024
"""

import pandas as pd
import numpy as np
from typing import Union, List, Dict, Optional, Any, Tuple
from abc import ABC, abstractmethod
import logging
from datetime import datetime, timedelta

from utils import (
    DataFrameValidator, ColumnValidator,
    format_number, format_percentage, safe_divide,
    get_column_types
)

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


# ============================================================================
# STRATEGY PATTERN - Insight Strategies
# Follows Open/Closed Principle and Strategy Pattern
# ============================================================================

class InsightStrategy(ABC):
    """
    Abstract base class for insight generation strategies.
    Follows Strategy Pattern - allows different insight algorithms.
    """

    @abstractmethod
    def generate(self, df: pd.DataFrame, **kwargs) -> Dict[str, Any]:
        """
        Generate insights from data.

        Args:
            df: DataFrame to analyze
            **kwargs: Additional parameters for insight generation

        Returns:
            Dict containing insight information
        """
        pass

    @abstractmethod
    def get_insight_type(self) -> str:
        """
        Get the type of insight this strategy generates.

        Returns:
            str: Insight type name
        """
        pass


# ============================================================================
# TOP/BOTTOM PERFORMERS INSIGHTS
# ============================================================================

class TopBottomPerformers(InsightStrategy):
    """
    Identify top and bottom performers in the data.
    Follows Single Responsibility Principle - only handles top/bottom analysis.
    """

    def get_insight_type(self) -> str:
        """Get insight type."""
        return "top_bottom_performers"

    def generate(self, df: pd.DataFrame,
                 column: str,
                 group_by: Optional[str] = None,
                 top_n: int = 5,
                 bottom_n: int = 5,
                 aggregation: str = 'sum',
                 **kwargs) -> Dict[str, Any]:
        """
        Generate top and bottom performer insights.

        Args:
            df: DataFrame to analyze
            column: Column to analyze for performance
            group_by: Optional column to group by
            top_n: Number of top performers to identify
            bottom_n: Number of bottom performers to identify
            aggregation: Aggregation method if group_by is used
            **kwargs: Additional parameters

        Returns:
            Dict with top and bottom performers
        """
        # Validate inputs
        DataFrameValidator().validate(df)
        ColumnValidator().validate(df, column)

        if group_by:
            ColumnValidator().validate(df, group_by)

            # Aggregate by group
            if aggregation == 'sum':
                data = df.groupby(group_by)[column].sum().sort_values(ascending=False)
            elif aggregation == 'mean':
                data = df.groupby(group_by)[column].mean().sort_values(ascending=False)
            elif aggregation == 'count':
                data = df.groupby(group_by)[column].count().sort_values(ascending=False)
            elif aggregation == 'median':
                data = df.groupby(group_by)[column].median().sort_values(ascending=False)
            else:
                data = df.groupby(group_by)[column].sum().sort_values(ascending=False)
        else:
            # Direct analysis on column
            data = df[column].sort_values(ascending=False)

        # Get top and bottom performers
        top_performers = data.head(top_n)
        bottom_performers = data.tail(bottom_n).sort_values(ascending=True)

        # Calculate statistics
        total = data.sum()
        top_contribution = safe_divide(top_performers.sum(), total) if total != 0 else 0
        bottom_contribution = safe_divide(bottom_performers.sum(), total) if total != 0 else 0

        insight = {
            'type': self.get_insight_type(),
            'column': column,
            'group_by': group_by,
            'aggregation': aggregation if group_by else 'direct',
            'top_performers': {
                'data': top_performers.to_dict(),
                'count': len(top_performers),
                'total_value': top_performers.sum(),
                'contribution_percentage': top_contribution
            },
            'bottom_performers': {
                'data': bottom_performers.to_dict(),
                'count': len(bottom_performers),
                'total_value': bottom_performers.sum(),
                'contribution_percentage': bottom_contribution
            },
            'summary': self._generate_summary(
                column, group_by, top_performers, bottom_performers,
                top_contribution, bottom_contribution
            )
        }

        logger.info(f"Generated top/bottom performers insight for {column}")
        return insight

    def _generate_summary(self, column: str, group_by: Optional[str],
                          top: pd.Series, bottom: pd.Series,
                          top_contrib: float, bottom_contrib: float) -> str:
        """Generate human-readable summary."""
        if group_by:
            top_name = top.index[0] if len(top) > 0 else "N/A"
            bottom_name = bottom.index[0] if len(bottom) > 0 else "N/A"

            summary = f"Top performer in {column}: '{top_name}' with {format_number(top.iloc[0])}. "
            summary += f"Bottom performer: '{bottom_name}' with {format_number(bottom.iloc[0])}. "
            summary += f"Top {len(top)} performers contribute {format_percentage(top_contrib)} of total."
        else:
            summary = f"Highest value in {column}: {format_number(top.iloc[0])}. "
            summary += f"Lowest value: {format_number(bottom.iloc[0])}. "
            summary += f"Range: {format_number(top.iloc[0] - bottom.iloc[0])}"

        return summary


# ============================================================================
# TREND ANALYSIS INSIGHTS
# ============================================================================

class TrendAnalysis(InsightStrategy):
    """
    Analyze trends in time series data.
    Follows Single Responsibility Principle - only handles trend analysis.
    """

    def get_insight_type(self) -> str:
        """Get insight type."""
        return "trend_analysis"

    def generate(self, df: pd.DataFrame,
                 date_column: str,
                 value_column: str,
                 period: str = 'overall',
                 **kwargs) -> Dict[str, Any]:
        """
        Generate trend analysis insights.

        Args:
            df: DataFrame to analyze
            date_column: Column containing dates
            value_column: Column containing values
            period: Analysis period ('overall', 'monthly', 'weekly', 'daily')
            **kwargs: Additional parameters

        Returns:
            Dict with trend insights
        """
        # Validate inputs
        DataFrameValidator().validate(df)
        ColumnValidator().validate(df, [date_column, value_column])

        # Prepare data
        df_trend = df[[date_column, value_column]].copy()

        # Ensure date column is datetime
        if not pd.api.types.is_datetime64_any_dtype(df_trend[date_column]):
            df_trend[date_column] = pd.to_datetime(df_trend[date_column], errors='coerce')

        # Remove NaN values
        df_trend = df_trend.dropna()

        if len(df_trend) < 2:
            return {
                'type': self.get_insight_type(),
                'error': 'Insufficient data for trend analysis',
                'summary': 'Not enough data points to analyze trends.'
            }

        # Sort by date
        df_trend = df_trend.sort_values(date_column)

        # Calculate trend metrics
        first_value = df_trend[value_column].iloc[0]
        last_value = df_trend[value_column].iloc[-1]
        change = last_value - first_value
        change_pct = safe_divide(change, first_value)

        # Determine trend direction
        if change > 0:
            trend_direction = 'increasing'
        elif change < 0:
            trend_direction = 'decreasing'
        else:
            trend_direction = 'stable'

        # Calculate statistics
        mean_value = df_trend[value_column].mean()
        median_value = df_trend[value_column].median()
        std_value = df_trend[value_column].std()

        # Calculate growth rate (if applicable)
        growth_rate = self._calculate_growth_rate(df_trend, date_column, value_column)

        # Detect volatility
        volatility = self._calculate_volatility(df_trend[value_column])

        insight = {
            'type': self.get_insight_type(),
            'date_column': date_column,
            'value_column': value_column,
            'period': period,
            'trend_direction': trend_direction,
            'metrics': {
                'first_value': first_value,
                'last_value': last_value,
                'absolute_change': change,
                'percentage_change': change_pct,
                'mean': mean_value,
                'median': median_value,
                'std_deviation': std_value,
                'growth_rate': growth_rate,
                'volatility': volatility
            },
            'date_range': {
                'start': df_trend[date_column].min().strftime('%Y-%m-%d'),
                'end': df_trend[date_column].max().strftime('%Y-%m-%d'),
                'days': (df_trend[date_column].max() - df_trend[date_column].min()).days
            },
            'summary': self._generate_summary(
                value_column, trend_direction, change, change_pct, volatility
            )
        }

        logger.info(f"Generated trend analysis insight for {value_column}")
        return insight

    def _calculate_growth_rate(self, df: pd.DataFrame,
                               date_col: str, value_col: str) -> Optional[float]:
        """Calculate average growth rate."""
        try:
            # Simple linear regression for growth rate
            x = (df[date_col] - df[date_col].min()).dt.days.values
            y = df[value_col].values

            if len(x) < 2:
                return None

            # Calculate slope
            slope = np.polyfit(x, y, 1)[0]
            return slope
        except Exception:
            return None

    def _calculate_volatility(self, series: pd.Series) -> str:
        """Calculate volatility level."""
        if len(series) < 2:
            return 'unknown'

        # Use coefficient of variation
        cv = safe_divide(series.std(), series.mean())

        if cv < 0.1:
            return 'low'
        elif cv < 0.3:
            return 'moderate'
        else:
            return 'high'

    def _generate_summary(self, column: str, direction: str,
                          change: float, change_pct: float, volatility: str) -> str:
        """Generate human-readable summary."""
        summary = f"{column} shows a {direction} trend with "
        summary += f"{format_percentage(abs(change_pct))} {'increase' if change > 0 else 'decrease'}. "
        summary += f"Absolute change: {format_number(change)}. "
        summary += f"Volatility: {volatility}."
        return summary


# ============================================================================
# ANOMALY DETECTION INSIGHTS
# ============================================================================

class AnomalyDetection(InsightStrategy):
    """
    Detect anomalies and outliers in data.
    Follows Single Responsibility Principle - only handles anomaly detection.
    """

    def get_insight_type(self) -> str:
        """Get insight type."""
        return "anomaly_detection"

    def generate(self, df: pd.DataFrame,
                 column: str,
                 method: str = 'zscore',
                 threshold: float = 3.0,
                 **kwargs) -> Dict[str, Any]:
        """
        Generate anomaly detection insights.

        Args:
            df: DataFrame to analyze
            column: Column to analyze for anomalies
            method: Detection method ('zscore' or 'iqr')
            threshold: Threshold for anomaly detection
            **kwargs: Additional parameters

        Returns:
            Dict with anomaly insights
        """
        # Validate inputs
        DataFrameValidator().validate(df)
        ColumnValidator().validate(df, column)

        # Check if column is numerical
        if not pd.api.types.is_numeric_dtype(df[column]):
            return {
                'type': self.get_insight_type(),
                'error': f'Column {column} is not numerical',
                'summary': f'Cannot detect anomalies in non-numerical column {column}.'
            }

        # Remove NaN values
        data = df[column].dropna()

        if len(data) < 3:
            return {
                'type': self.get_insight_type(),
                'error': 'Insufficient data',
                'summary': 'Not enough data points to detect anomalies.'
            }

        # Detect anomalies
        if method == 'zscore':
            anomalies_mask = self._detect_zscore(data, threshold)
        elif method == 'iqr':
            anomalies_mask = self._detect_iqr(data, threshold)
        else:
            raise ValueError(f"Unsupported method: {method}")

        anomalies = data[anomalies_mask]

        # Calculate statistics
        total_points = len(data)
        anomaly_count = len(anomalies)
        anomaly_percentage = safe_divide(anomaly_count, total_points)

        insight = {
            'type': self.get_insight_type(),
            'column': column,
            'method': method,
            'threshold': threshold,
            'statistics': {
                'total_points': total_points,
                'anomaly_count': anomaly_count,
                'anomaly_percentage': anomaly_percentage,
                'mean': data.mean(),
                'median': data.median(),
                'std': data.std(),
                'min': data.min(),
                'max': data.max()
            },
            'anomalies': {
                'values': anomalies.tolist()[:20],  # Limit to first 20
                'max_anomaly': anomalies.max() if len(anomalies) > 0 else None,
                'min_anomaly': anomalies.min() if len(anomalies) > 0 else None
            },
            'summary': self._generate_summary(
                column, method, anomaly_count, anomaly_percentage,
                anomalies.max() if len(anomalies) > 0 else None,
                anomalies.min() if len(anomalies) > 0 else None
            )
        }

        logger.info(f"Generated anomaly detection insight for {column}")
        return insight

    def _detect_zscore(self, series: pd.Series, threshold: float) -> pd.Series:
        """Detect anomalies using Z-score method."""
        z_scores = np.abs((series - series.mean()) / series.std())
        return z_scores > threshold

    def _detect_iqr(self, series: pd.Series, threshold: float) -> pd.Series:
        """Detect anomalies using IQR method."""
        Q1 = series.quantile(0.25)
        Q3 = series.quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - threshold * IQR
        upper_bound = Q3 + threshold * IQR
        return (series < lower_bound) | (series > upper_bound)

    def _generate_summary(self, column: str, method: str,
                          count: int, percentage: float,
                          max_anomaly: Optional[float],
                          min_anomaly: Optional[float]) -> str:
        """Generate human-readable summary."""
        if count == 0:
            return f"No anomalies detected in {column} using {method} method."

        summary = f"Detected {count} anomalies ({format_percentage(percentage)}) in {column}. "

        if max_anomaly and min_anomaly:
            summary += f"Range of anomalies: {format_number(min_anomaly)} to {format_number(max_anomaly)}."

        return summary


# ============================================================================
# DISTRIBUTION INSIGHTS
# ============================================================================

class DistributionInsights(InsightStrategy):
    """
    Analyze data distribution characteristics.
    Follows Single Responsibility Principle - only handles distribution analysis.
    """

    def get_insight_type(self) -> str:
        """Get insight type."""
        return "distribution_insights"

    def generate(self, df: pd.DataFrame,
                 column: str,
                 **kwargs) -> Dict[str, Any]:
        """
        Generate distribution insights.

        Args:
            df: DataFrame to analyze
            column: Column to analyze
            **kwargs: Additional parameters

        Returns:
            Dict with distribution insights
        """
        # Validate inputs
        DataFrameValidator().validate(df)
        ColumnValidator().validate(df, column)

        # Check if column is numerical
        if not pd.api.types.is_numeric_dtype(df[column]):
            # For categorical columns
            return self._categorical_distribution(df, column)
        else:
            # For numerical columns
            return self._numerical_distribution(df, column)

    def _numerical_distribution(self, df: pd.DataFrame, column: str) -> Dict[str, Any]:
        """Analyze numerical distribution."""
        data = df[column].dropna()

        if len(data) == 0:
            return {
                'type': self.get_insight_type(),
                'error': 'No valid data',
                'summary': f'No valid data in column {column}.'
            }

        # Calculate statistics
        statistics = {
            'count': len(data),
            'mean': data.mean(),
            'median': data.median(),
            'mode': data.mode()[0] if len(data.mode()) > 0 else None,
            'std': data.std(),
            'min': data.min(),
            'max': data.max(),
            'range': data.max() - data.min(),
            'q1': data.quantile(0.25),
            'q3': data.quantile(0.75),
            'iqr': data.quantile(0.75) - data.quantile(0.25),
            'skewness': data.skew(),
            'kurtosis': data.kurtosis()
        }

        # Determine distribution shape
        shape = self._determine_shape(statistics['skewness'], statistics['kurtosis'])

        insight = {
            'type': self.get_insight_type(),
            'column': column,
            'data_type': 'numerical',
            'statistics': statistics,
            'distribution_shape': shape,
            'summary': self._generate_numerical_summary(column, statistics, shape)
        }

        logger.info(f"Generated distribution insight for {column}")
        return insight

    def _categorical_distribution(self, df: pd.DataFrame, column: str) -> Dict[str, Any]:
        """Analyze categorical distribution."""
        data = df[column].dropna()

        if len(data) == 0:
            return {
                'type': self.get_insight_type(),
                'error': 'No valid data',
                'summary': f'No valid data in column {column}.'
            }

        # Calculate statistics
        value_counts = data.value_counts()

        statistics = {
            'count': len(data),
            'unique_values': data.nunique(),
            'most_common': value_counts.index[0],
            'most_common_count': value_counts.iloc[0],
            'most_common_percentage': safe_divide(value_counts.iloc[0], len(data)),
            'least_common': value_counts.index[-1],
            'least_common_count': value_counts.iloc[-1]
        }

        insight = {
            'type': self.get_insight_type(),
            'column': column,
            'data_type': 'categorical',
            'statistics': statistics,
            'value_counts': value_counts.head(10).to_dict(),
            'summary': self._generate_categorical_summary(column, statistics)
        }

        logger.info(f"Generated distribution insight for {column}")
        return insight

    def _determine_shape(self, skewness: float, kurtosis: float) -> str:
        """Determine distribution shape from skewness and kurtosis."""
        if abs(skewness) < 0.5 and abs(kurtosis) < 0.5:
            return 'approximately normal'
        elif skewness > 0.5:
            return 'right-skewed (positive skew)'
        elif skewness < -0.5:
            return 'left-skewed (negative skew)'
        elif kurtosis > 1:
            return 'heavy-tailed (leptokurtic)'
        elif kurtosis < -1:
            return 'light-tailed (platykurtic)'
        else:
            return 'mixed characteristics'

    def _generate_numerical_summary(self, column: str,
                                    stats: Dict, shape: str) -> str:
        """Generate summary for numerical distribution."""
        summary = f"{column} has a {shape} distribution. "
        summary += f"Mean: {format_number(stats['mean'])}, "
        summary += f"Median: {format_number(stats['median'])}, "
        summary += f"Std Dev: {format_number(stats['std'])}. "
        summary += f"Range: {format_number(stats['min'])} to {format_number(stats['max'])}."
        return summary

    def _generate_categorical_summary(self, column: str, stats: Dict) -> str:
        """Generate summary for categorical distribution."""
        summary = f"{column} has {stats['unique_values']} unique values. "
        summary += f"Most common: '{stats['most_common']}' "
        summary += f"({format_percentage(stats['most_common_percentage'])})."
        return summary


# ============================================================================
# CORRELATION INSIGHTS
# ============================================================================

class CorrelationInsights(InsightStrategy):
    """
    Identify strong correlations between variables.
    Follows Single Responsibility Principle - only handles correlation analysis.
    """

    def get_insight_type(self) -> str:
        """Get insight type."""
        return "correlation_insights"

    def generate(self, df: pd.DataFrame,
                 columns: Optional[List[str]] = None,
                 threshold: float = 0.7,
                 method: str = 'pearson',
                 **kwargs) -> Dict[str, Any]:
        """
        Generate correlation insights.

        Args:
            df: DataFrame to analyze
            columns: Optional list of columns to analyze
            threshold: Correlation threshold for strong correlations
            method: Correlation method ('pearson', 'spearman', 'kendall')
            **kwargs: Additional parameters

        Returns:
            Dict with correlation insights
        """
        # Validate inputs
        DataFrameValidator().validate(df)

        # Select numerical columns
        if columns:
            ColumnValidator().validate(df, columns)
            df_corr = df[columns].select_dtypes(include=[np.number])
        else:
            df_corr = df.select_dtypes(include=[np.number])

        if df_corr.shape[1] < 2:
            return {
                'type': self.get_insight_type(),
                'error': 'Insufficient numerical columns',
                'summary': 'Need at least 2 numerical columns for correlation analysis.'
            }

        # Calculate correlation matrix
        corr_matrix = df_corr.corr(method=method)

        # Find strong correlations
        strong_correlations = []

        for i in range(len(corr_matrix.columns)):
            for j in range(i + 1, len(corr_matrix.columns)):
                corr_value = corr_matrix.iloc[i, j]

                if abs(corr_value) >= threshold:
                    strong_correlations.append({
                        'variable1': corr_matrix.columns[i],
                        'variable2': corr_matrix.columns[j],
                        'correlation': corr_value,
                        'strength': self._classify_strength(abs(corr_value)),
                        'direction': 'positive' if corr_value > 0 else 'negative'
                    })

        # Sort by absolute correlation value
        strong_correlations.sort(key=lambda x: abs(x['correlation']), reverse=True)

        insight = {
            'type': self.get_insight_type(),
            'method': method,
            'threshold': threshold,
            'total_pairs_analyzed': len(corr_matrix.columns) * (len(corr_matrix.columns) - 1) // 2,
            'strong_correlations_found': len(strong_correlations),
            'correlations': strong_correlations[:10],  # Top 10
            'summary': self._generate_summary(strong_correlations, threshold)
        }

        logger.info(f"Generated correlation insights with {len(strong_correlations)} strong correlations")
        return insight

    def _classify_strength(self, abs_corr: float) -> str:
        """Classify correlation strength."""
        if abs_corr >= 0.9:
            return 'very strong'
        elif abs_corr >= 0.7:
            return 'strong'
        elif abs_corr >= 0.5:
            return 'moderate'
        elif abs_corr >= 0.3:
            return 'weak'
        else:
            return 'very weak'

    def _generate_summary(self, correlations: List[Dict], threshold: float) -> str:
        """Generate human-readable summary."""
        if len(correlations) == 0:
            return f"No strong correlations (threshold: {threshold}) found."

        top = correlations[0]
        summary = f"Found {len(correlations)} strong correlations. "
        summary += f"Strongest: {top['variable1']} and {top['variable2']} "
        summary += f"({top['direction']}, {format_number(top['correlation'])})."

        return summary


# ============================================================================
# INSIGHT MANAGER
# Uses Strategy Pattern to manage different insight types
# ============================================================================

class InsightManager:
    """
    Manager class for insights using Strategy Pattern.
    Follows Open/Closed Principle - open for extension, closed for modification.
    """

    def __init__(self):
        """Initialize InsightManager with all available strategies."""
        self.strategies: Dict[str, InsightStrategy] = {
            'top_bottom': TopBottomPerformers(),
            'trend': TrendAnalysis(),
            'anomaly': AnomalyDetection(),
            'distribution': DistributionInsights(),
            'correlation': CorrelationInsights()
        }

    def generate_insight(self, insight_type: str, df: pd.DataFrame, **kwargs) -> Dict[str, Any]:
        """
        Generate insight using specified strategy.

        Args:
            insight_type: Type of insight to generate
            df: DataFrame to analyze
            **kwargs: Parameters specific to insight type

        Returns:
            Dict with insight information

        Raises:
            ValueError: If insight type is not supported
        """
        if insight_type not in self.strategies:
            raise ValueError(
                f"Unsupported insight type: {insight_type}. "
                f"Available types: {list(self.strategies.keys())}"
            )

        strategy = self.strategies[insight_type]
        return strategy.generate(df, **kwargs)

    def generate_all_insights(self, df: pd.DataFrame,
                              config: Optional[Dict[str, Dict]] = None) -> Dict[str, Dict[str, Any]]:
        """
        Generate all available insights.

        Args:
            df: DataFrame to analyze
            config: Optional configuration for each insight type

        Returns:
            Dict with all insights
        """
        all_insights = {}

        # Get column types
        column_types = get_column_types(df)

        # Generate insights based on available data
        try:
            # Top/Bottom performers (if numerical columns exist)
            if len(column_types['numerical']) > 0:
                col = column_types['numerical'][0]
                params = config.get('top_bottom', {}) if config else {}
                all_insights['top_bottom'] = self.generate_insight(
                    'top_bottom', df, column=col, **params
                )
        except Exception as e:
            logger.warning(f"Could not generate top/bottom insight: {e}")

        try:
            # Distribution insights
            if len(column_types['numerical']) > 0:
                col = column_types['numerical'][0]
                params = config.get('distribution', {}) if config else {}
                all_insights['distribution'] = self.generate_insight(
                    'distribution', df, column=col, **params
                )
        except Exception as e:
            logger.warning(f"Could not generate distribution insight: {e}")

        try:
            # Anomaly detection
            if len(column_types['numerical']) > 0:
                col = column_types['numerical'][0]
                params = config.get('anomaly', {}) if config else {}
                all_insights['anomaly'] = self.generate_insight(
                    'anomaly', df, column=col, **params
                )
        except Exception as e:
            logger.warning(f"Could not generate anomaly insight: {e}")

        try:
            # Correlation insights
            if len(column_types['numerical']) >= 2:
                params = config.get('correlation', {}) if config else {}
                all_insights['correlation'] = self.generate_insight(
                    'correlation', df, **params
                )
        except Exception as e:
            logger.warning(f"Could not generate correlation insight: {e}")

        try:
            # Trend analysis (if datetime columns exist)
            if len(column_types['datetime']) > 0 and len(column_types['numerical']) > 0:
                date_col = column_types['datetime'][0]
                value_col = column_types['numerical'][0]
                params = config.get('trend', {}) if config else {}
                all_insights['trend'] = self.generate_insight(
                    'trend', df, date_column=date_col, value_column=value_col, **params
                )
        except Exception as e:
            logger.warning(f"Could not generate trend insight: {e}")

        return all_insights

    def add_strategy(self, name: str, strategy: InsightStrategy) -> None:
        """
        Add new insight strategy.
        Follows Open/Closed Principle - extend functionality without modifying existing code.

        Args:
            name: Name for the strategy
            strategy: Insight strategy instance
        """
        self.strategies[name] = strategy
        logger.info(f"Added new insight strategy: {name}")

    def get_available_insights(self) -> List[str]:
        """
        Get list of available insight types.

        Returns:
            List of insight type names
        """
        return list(self.strategies.keys())

    def format_insight_report(self, insights: Dict[str, Dict[str, Any]]) -> str:
        """
        Format insights into a readable report.

        Args:
            insights: Dict of insights from generate_all_insights

        Returns:
            Formatted string report
        """
        report = "=" * 80 + "\n"
        report += "AUTOMATED INSIGHTS REPORT\n"
        report += "=" * 80 + "\n\n"

        for insight_name, insight_data in insights.items():
            report += f"\n{insight_name.upper().replace('_', ' ')}\n"
            report += "-" * 80 + "\n"

            if 'error' in insight_data:
                report += f"Error: {insight_data['error']}\n"
            elif 'summary' in insight_data:
                report += f"{insight_data['summary']}\n"

            report += "\n"

        report += "=" * 80 + "\n"
        return report


if __name__ == "__main__":
    # Example usage
    print("Insights module loaded successfully")

    # Demonstrate available insights
    manager = InsightManager()
    print(f"Available insights: {manager.get_available_insights()}")
requirements.txt
ADDED
@@ -0,0 +1,7 @@
gradio
pandas
numpy
matplotlib
seaborn
plotly
openpyxl
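All seven dependencies are unpinned, so a fresh environment is set up with a plain "pip install -r requirements.txt" and pulls the latest releases; openpyxl is included because it is the engine pandas uses to read the bundled Online_Retail.xlsx.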
tests/__pycache__/conftest.cpython-310-pytest-8.4.2.pyc
ADDED
Binary file (411 Bytes)

tests/__pycache__/test_data_processor.cpython-310-pytest-8.4.2.pyc
ADDED
Binary file (33 kB)

tests/__pycache__/test_insights.cpython-310-pytest-8.4.2.pyc
ADDED
Binary file (29.5 kB)

tests/__pycache__/test_utils.cpython-310-pytest-8.4.2.pyc
ADDED
Binary file (29.6 kB)

tests/__pycache__/test_visualizations.cpython-310-pytest-8.4.2.pyc
ADDED
Binary file (26.5 kB)
tests/conftest.py
ADDED
@@ -0,0 +1,5 @@
import sys
from pathlib import Path

# Add the parent directory to Python path so tests can import modules
sys.path.insert(0, str(Path(__file__).parent.parent))
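Because conftest.py prepends the repository root to sys.path, the suite can be run from the project root with a bare "pytest tests/" invocation, and the test modules can import data_processor, insights, utils, and visualizations without installing the project as a package.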
tests/test_app.py
ADDED
File without changes
tests/test_data_processor.py
ADDED
@@ -0,0 +1,453 @@
"""
Unit Tests for Data Processor Module

Comprehensive tests for all data processing functionality including
Strategy Pattern implementation, data loading, cleaning, and filtering.

Author: Craig
Date: December 2024
"""

import pytest
import pandas as pd
import numpy as np
from pathlib import Path
import tempfile
import os
from datetime import datetime

from data_processor import (
    DataLoadStrategy, CSVLoadStrategy, ExcelLoadStrategy,
    JSONLoadStrategy, ParquetLoadStrategy,
    DataLoader, DataCleaner, DataProfiler, DataFilter, DataProcessor
)


# ============================================================================
# FIXTURES
# ============================================================================

@pytest.fixture
def sample_dataframe():
    """Create a sample DataFrame for testing."""
    return pd.DataFrame({
        'id': [1, 2, 3, 4, 5],
        'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
        'age': [25, 30, 35, 40, 45],
        'salary': [50000, 60000, 70000, 80000, 90000],
        'department': ['HR', 'IT', 'IT', 'Finance', 'HR'],
        'hire_date': pd.date_range('2020-01-01', periods=5)
    })


@pytest.fixture
def dataframe_with_missing():
    """Create DataFrame with missing values."""
    return pd.DataFrame({
        'col1': [1, 2, np.nan, 4, 5],
        'col2': ['a', 'b', 'c', np.nan, 'e'],
        'col3': [10.5, np.nan, 30.5, 40.5, 50.5]
    })


@pytest.fixture
def dataframe_with_duplicates():
    """Create DataFrame with duplicate rows."""
    return pd.DataFrame({
        'id': [1, 2, 3, 2, 4],
        'value': ['a', 'b', 'c', 'b', 'd']
    })


@pytest.fixture
def temp_csv_file(sample_dataframe):
    """Create temporary CSV file."""
    temp_path = tempfile.mktemp(suffix='.csv')
    sample_dataframe.to_csv(temp_path, index=False)
    yield temp_path
    if os.path.exists(temp_path):
        os.remove(temp_path)


@pytest.fixture
def temp_excel_file(sample_dataframe):
    """Create temporary Excel file."""
    temp_path = tempfile.mktemp(suffix='.xlsx')
    sample_dataframe.to_excel(temp_path, index=False)
    yield temp_path
    if os.path.exists(temp_path):
        os.remove(temp_path)


@pytest.fixture
def temp_json_file(sample_dataframe):
    """Create temporary JSON file."""
    temp_path = tempfile.mktemp(suffix='.json')
    # Drop datetime column for JSON compatibility
    df_json = sample_dataframe.drop('hire_date', axis=1)
    df_json.to_json(temp_path)
    yield temp_path
    if os.path.exists(temp_path):
        os.remove(temp_path)


# ============================================================================
# STRATEGY PATTERN TESTS
# ============================================================================

class TestCSVLoadStrategy:
    """Test suite for CSVLoadStrategy."""

    def test_can_handle_csv(self):
        """Test CSV file detection."""
        strategy = CSVLoadStrategy()
        assert strategy.can_handle('file.csv') is True
        assert strategy.can_handle('file.CSV') is True
        assert strategy.can_handle('file.xlsx') is False

    def test_load_csv(self, temp_csv_file):
        """Test loading CSV file."""
        strategy = CSVLoadStrategy()
        df = strategy.load(temp_csv_file)
        assert isinstance(df, pd.DataFrame)
        assert len(df) > 0

    def test_load_nonexistent_csv(self):
        """Test loading non-existent CSV file."""
        strategy = CSVLoadStrategy()
        with pytest.raises(Exception):
            strategy.load('nonexistent.csv')


class TestExcelLoadStrategy:
    """Test suite for ExcelLoadStrategy."""

    def test_can_handle_excel(self):
        """Test Excel file detection."""
        strategy = ExcelLoadStrategy()
        assert strategy.can_handle('file.xlsx') is True
        assert strategy.can_handle('file.xls') is True
        assert strategy.can_handle('file.XLSX') is True
        assert strategy.can_handle('file.csv') is False

    def test_load_excel(self, temp_excel_file):
        """Test loading Excel file."""
        strategy = ExcelLoadStrategy()
        df = strategy.load(temp_excel_file)
        assert isinstance(df, pd.DataFrame)
        assert len(df) > 0


class TestJSONLoadStrategy:
    """Test suite for JSONLoadStrategy."""

    def test_can_handle_json(self):
        """Test JSON file detection."""
        strategy = JSONLoadStrategy()
        assert strategy.can_handle('file.json') is True
        assert strategy.can_handle('file.JSON') is True
        assert strategy.can_handle('file.csv') is False

    def test_load_json(self, temp_json_file):
        """Test loading JSON file."""
        strategy = JSONLoadStrategy()
        df = strategy.load(temp_json_file)
        assert isinstance(df, pd.DataFrame)
        assert len(df) > 0


class TestParquetLoadStrategy:
    """Test suite for ParquetLoadStrategy."""

    def test_can_handle_parquet(self):
        """Test Parquet file detection."""
        strategy = ParquetLoadStrategy()
        assert strategy.can_handle('file.parquet') is True
        assert strategy.can_handle('file.PARQUET') is True
        assert strategy.can_handle('file.csv') is False


# ============================================================================
# DATA LOADER TESTS
# ============================================================================

class TestDataLoader:
    """Test suite for DataLoader class."""

    def test_initialization(self):
        """Test DataLoader initialization."""
        loader = DataLoader()
        assert len(loader.strategies) >= 4

    def test_load_csv(self, temp_csv_file):
        """Test loading CSV through DataLoader."""
        loader = DataLoader()
        df = loader.load_data(temp_csv_file)
        assert isinstance(df, pd.DataFrame)
        assert len(df) == 5

    def test_load_excel(self, temp_excel_file):
        """Test loading Excel through DataLoader."""
        loader = DataLoader()
        df = loader.load_data(temp_excel_file)
        assert isinstance(df, pd.DataFrame)
        assert len(df) == 5

    def test_load_json(self, temp_json_file):
        """Test loading JSON through DataLoader."""
        loader = DataLoader()
        df = loader.load_data(temp_json_file)
        assert isinstance(df, pd.DataFrame)

    def test_load_nonexistent_file(self):
        """Test loading non-existent file."""
        loader = DataLoader()
        with pytest.raises(FileNotFoundError):
            loader.load_data('nonexistent.csv')

    def test_add_strategy(self):
        """Test adding new strategy."""
        loader = DataLoader()
        initial_count = len(loader.strategies)

        # Create mock strategy
        class MockStrategy(DataLoadStrategy):
            def can_handle(self, filepath):
                return False

            def load(self, filepath):
                return pd.DataFrame()

        loader.add_strategy(MockStrategy())
        assert len(loader.strategies) == initial_count + 1


# ============================================================================
# DATA CLEANER TESTS
# ============================================================================

class TestDataCleaner:
    """Test suite for DataCleaner class."""

    def test_handle_missing_none(self, dataframe_with_missing):
        """Test 'none' strategy - no changes."""
        df_cleaned = DataCleaner.handle_missing_values(dataframe_with_missing, strategy='none')
        assert df_cleaned.isnull().sum().sum() == dataframe_with_missing.isnull().sum().sum()

    def test_handle_missing_drop(self, dataframe_with_missing):
        """Test dropping rows with missing values."""
        df_cleaned = DataCleaner.handle_missing_values(dataframe_with_missing, strategy='drop')
        assert df_cleaned.isnull().sum().sum() == 0
        assert len(df_cleaned) < len(dataframe_with_missing)

    def test_handle_missing_fill_mean(self, dataframe_with_missing):
        """Test filling with mean."""
        df_cleaned = DataCleaner.handle_missing_values(dataframe_with_missing, strategy='fill_mean')
        numerical_cols = df_cleaned.select_dtypes(include=[np.number]).columns
        for col in numerical_cols:
            assert df_cleaned[col].isnull().sum() == 0

    def test_handle_missing_fill_median(self, dataframe_with_missing):
        """Test filling with median."""
        df_cleaned = DataCleaner.handle_missing_values(dataframe_with_missing, strategy='fill_median')
        numerical_cols = df_cleaned.select_dtypes(include=[np.number]).columns
        for col in numerical_cols:
            assert df_cleaned[col].isnull().sum() == 0

    def test_handle_missing_fill_mode(self, dataframe_with_missing):
        """Test filling with mode."""
        df_cleaned = DataCleaner.handle_missing_values(dataframe_with_missing, strategy='fill_mode')
        # Check categorical columns are filled
        assert df_cleaned['col2'].isnull().sum() == 0

    def test_convert_data_types(self):
        """Test automatic data type conversion."""
        df = pd.DataFrame({
            'price': ['$100', '$200', '$300'],
            'date': ['2024-01-01', '2024-01-02', '2024-01-03'],
            'bool_col': ['TRUE', 'FALSE', 'TRUE']
        })

        df_converted = DataCleaner.convert_data_types(df)

        # Check currency conversion
        assert pd.api.types.is_numeric_dtype(df_converted['price'])

        # Check date conversion
        assert pd.api.types.is_datetime64_any_dtype(df_converted['date'])

    def test_remove_duplicates(self, dataframe_with_duplicates):
        """Test removing duplicate rows."""
        df_cleaned = DataCleaner.remove_duplicates(dataframe_with_duplicates)
        assert len(df_cleaned) < len(dataframe_with_duplicates)
        assert df_cleaned.duplicated().sum() == 0

    def test_handle_outliers_zscore(self):
        """Test outlier removal using z-score."""
        df = pd.DataFrame({
            'values': [1, 2, 3, 4, 5, 100]  # 100 is an outlier
        })

        df_cleaned = DataCleaner.handle_outliers(df, ['values'], method='zscore', threshold=2.0)
        assert len(df_cleaned) < len(df)
        assert 100 not in df_cleaned['values'].values

    def test_handle_outliers_iqr(self):
        """Test outlier removal using IQR."""
        df = pd.DataFrame({
            'values': [1, 2, 3, 4, 5, 100]  # 100 is an outlier
        })

        df_cleaned = DataCleaner.handle_outliers(df, ['values'], method='iqr', threshold=1.5)
        assert len(df_cleaned) < len(df)


# ============================================================================
# DATA PROFILER TESTS
# ============================================================================

class TestDataProfiler:
    """Test suite for DataProfiler class."""

    def test_initialization(self, sample_dataframe):
        """Test DataProfiler initialization."""
        profiler = DataProfiler(sample_dataframe)
        assert profiler.df is not None

    def test_initialization_empty_dataframe(self):
        """Test initialization with empty DataFrame."""
        with pytest.raises(ValueError):
            DataProfiler(pd.DataFrame())

    def test_get_basic_info(self, sample_dataframe):
        """Test getting basic info."""
        profiler = DataProfiler(sample_dataframe)
        info = profiler.get_basic_info()

        assert info['rows'] == 5
        assert info['columns'] == 6
        assert 'column_names' in info
        assert 'data_types' in info
        assert 'memory_usage' in info

    def test_get_missing_values_report(self, dataframe_with_missing):
        """Test missing values report."""
        profiler = DataProfiler(dataframe_with_missing)
        report = profiler.get_missing_values_report()

        assert isinstance(report, pd.DataFrame)
        assert len(report) > 0
        assert 'Missing_Count' in report.columns
        assert 'Missing_Percentage' in report.columns

    def test_get_numerical_summary(self, sample_dataframe):
        """Test numerical summary statistics."""
        profiler = DataProfiler(sample_dataframe)
        summary = profiler.get_numerical_summary()

        assert isinstance(summary, pd.DataFrame)
        assert 'age' in summary.columns
        assert 'salary' in summary.columns

    def test_get_categorical_summary(self, sample_dataframe):
        """Test categorical summary statistics."""
        profiler = DataProfiler(sample_dataframe)
        summary = profiler.get_categorical_summary()

        assert isinstance(summary, dict)
        assert 'department' in summary
        assert 'unique_count' in summary['department']
        assert 'top_value' in summary['department']

    def test_get_correlation_matrix(self, sample_dataframe):
        """Test correlation matrix generation."""
        profiler = DataProfiler(sample_dataframe)
        corr_matrix = profiler.get_correlation_matrix()
|
| 366 |
+
|
| 367 |
+
assert isinstance(corr_matrix, pd.DataFrame)
|
| 368 |
+
assert 'age' in corr_matrix.columns
|
| 369 |
+
assert 'salary' in corr_matrix.columns
|
| 370 |
+
|
| 371 |
+
def test_get_full_profile(self, sample_dataframe):
|
| 372 |
+
"""Test full profile generation."""
|
| 373 |
+
profiler = DataProfiler(sample_dataframe)
|
| 374 |
+
profile = profiler.get_full_profile()
|
| 375 |
+
|
| 376 |
+
assert 'basic_info' in profile
|
| 377 |
+
assert 'missing_values' in profile
|
| 378 |
+
assert 'numerical_summary' in profile
|
| 379 |
+
assert 'categorical_summary' in profile
|
| 380 |
+
assert 'correlation_matrix' in profile
|
| 381 |
+
|
| 382 |
+
|
| 383 |
+
# ============================================================================
|
| 384 |
+
# DATA FILTER TESTS
|
| 385 |
+
# ============================================================================
|
| 386 |
+
|
| 387 |
+
class TestDataFilter:
|
| 388 |
+
"""Test suite for DataFilter class."""
|
| 389 |
+
|
| 390 |
+
def test_filter_numerical_min(self, sample_dataframe):
|
| 391 |
+
"""Test filtering with minimum value."""
|
| 392 |
+
filtered = DataFilter.filter_numerical(sample_dataframe, 'age', min_val=30)
|
| 393 |
+
assert len(filtered) == 4
|
| 394 |
+
assert filtered['age'].min() >= 30
|
| 395 |
+
|
| 396 |
+
def test_filter_numerical_max(self, sample_dataframe):
|
| 397 |
+
"""Test filtering with maximum value."""
|
| 398 |
+
filtered = DataFilter.filter_numerical(sample_dataframe, 'age', max_val=35)
|
| 399 |
+
assert len(filtered) == 3
|
| 400 |
+
assert filtered['age'].max() <= 35
|
| 401 |
+
|
| 402 |
+
def test_filter_numerical_range(self, sample_dataframe):
|
| 403 |
+
"""Test filtering with range."""
|
| 404 |
+
filtered = DataFilter.filter_numerical(sample_dataframe, 'age', min_val=30, max_val=40)
|
| 405 |
+
assert len(filtered) == 3
|
| 406 |
+
assert filtered['age'].min() >= 30
|
| 407 |
+
assert filtered['age'].max() <= 40
|
| 408 |
+
|
| 409 |
+
def test_filter_categorical(self, sample_dataframe):
|
| 410 |
+
"""Test categorical filtering."""
|
| 411 |
+
filtered = DataFilter.filter_categorical(sample_dataframe, 'department', ['IT', 'HR'])
|
| 412 |
+
assert len(filtered) == 4
|
| 413 |
+
assert all(filtered['department'].isin(['IT', 'HR']))
|
| 414 |
+
|
| 415 |
+
def test_filter_categorical_empty_values(self, sample_dataframe):
|
| 416 |
+
"""Test categorical filtering with empty values list."""
|
| 417 |
+
filtered = DataFilter.filter_categorical(sample_dataframe, 'department', [])
|
| 418 |
+
assert len(filtered) == len(sample_dataframe)
|
| 419 |
+
|
| 420 |
+
def test_filter_date_range(self, sample_dataframe):
|
| 421 |
+
"""Test date range filtering."""
|
| 422 |
+
start_date = pd.Timestamp('2020-01-02')
|
| 423 |
+
end_date = pd.Timestamp('2020-01-04')
|
| 424 |
+
|
| 425 |
+
filtered = DataFilter.filter_date_range(sample_dataframe, 'hire_date', start_date, end_date)
|
| 426 |
+
assert len(filtered) == 3
|
| 427 |
+
|
| 428 |
+
def test_apply_multiple_filters(self, sample_dataframe):
|
| 429 |
+
"""Test applying multiple filters."""
|
| 430 |
+
filters = [
|
| 431 |
+
{'type': 'numerical', 'column': 'age', 'min_val': 30, 'max_val': 40},
|
| 432 |
+
{'type': 'categorical', 'column': 'department', 'values': ['IT', 'Finance']}
|
| 433 |
+
]
|
| 434 |
+
|
| 435 |
+
filtered = DataFilter.apply_multiple_filters(sample_dataframe, filters)
|
| 436 |
+
assert len(filtered) <= len(sample_dataframe)
|
| 437 |
+
|
| 438 |
+
def test_filter_invalid_column(self, sample_dataframe):
|
| 439 |
+
"""Test filtering with invalid column."""
|
| 440 |
+
with pytest.raises(ValueError):
|
| 441 |
+
DataFilter.filter_numerical(sample_dataframe, 'nonexistent', min_val=0)
|
| 442 |
+
|
| 443 |
+
|
| 444 |
+
# ============================================================================
|
| 445 |
+
# DATA PROCESSOR TESTS (Facade)
|
| 446 |
+
# ============================================================================
|
| 447 |
+
|
| 448 |
+
class TestDataProcessor:
|
| 449 |
+
"""Test suite for DataProcessor class (Facade)."""
|
| 450 |
+
|
| 451 |
+
def test_initialization(self):
|
| 452 |
+
"""Test DataProcessor initialization."""
|
| 453 |
+
processor = DataProcessor()
|
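The MockStrategy in the loader test above pins down the extension contract for file loading: a strategy needs only `can_handle()` and `load()`. For orientation, here is a minimal sketch of what the `DataLoadStrategy` base class in data_processor.py presumably looks like, inferred purely from how the tests call it; the real definition may differ:

    from abc import ABC, abstractmethod
    import pandas as pd

    class DataLoadStrategy(ABC):
        """Base class for format-specific loaders; the loader tries each registered strategy in turn."""

        @abstractmethod
        def can_handle(self, filepath: str) -> bool:
            """Return True if this strategy can load the given file."""

        @abstractmethod
        def load(self, filepath: str) -> pd.DataFrame:
            """Parse the file and return its contents as a DataFrame."""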
tests/test_insights.py
ADDED
@@ -0,0 +1,554 @@
+"""
+Unit Tests for Insights Module
+
+Comprehensive tests for all insight strategies and the insight manager.
+
+Author: Craig
+Date: December 2024
+"""
+
+import pytest
+import pandas as pd
+import numpy as np
+from datetime import datetime, timedelta
+
+from insights import (
+    InsightStrategy, TopBottomPerformers, TrendAnalysis,
+    AnomalyDetection, DistributionInsights, CorrelationInsights,
+    InsightManager
+)
+
+
+# ============================================================================
+# FIXTURES
+# ============================================================================
+
+@pytest.fixture
+def sales_data():
+    """Create sample sales data."""
+    return pd.DataFrame({
+        'product': ['A', 'B', 'C', 'D', 'E'] * 20,
+        'sales': np.random.randint(100, 1000, 100),
+        'revenue': np.random.uniform(1000, 5000, 100),
+        'region': np.random.choice(['North', 'South', 'East', 'West'], 100)
+    })
+
+
+@pytest.fixture
+def time_series_data():
+    """Create sample time series data."""
+    dates = pd.date_range('2024-01-01', periods=100, freq='D')
+    values = np.cumsum(np.random.randn(100)) + 100  # Random walk with trend
+    return pd.DataFrame({
+        'date': dates,
+        'value': values,
+        'sales': np.random.randint(50, 200, 100)
+    })
+
+
+@pytest.fixture
+def anomaly_data():
+    """Create data with anomalies."""
+    # Normal data with a few outliers
+    normal = np.random.normal(100, 10, 95)
+    outliers = np.array([200, 10, 250, 5, 220])
+    data = np.concatenate([normal, outliers])
+    np.random.shuffle(data)
+
+    return pd.DataFrame({
+        'values': data,
+        'category': np.random.choice(['A', 'B', 'C'], 100)
+    })
+
+
+@pytest.fixture
+def correlation_data():
+    """Create data with correlations."""
+    np.random.seed(42)
+    x = np.random.normal(50, 10, 100)
+    y = 2 * x + np.random.normal(0, 5, 100)  # Strong positive correlation
+    z = -1.5 * x + np.random.normal(0, 8, 100)  # Strong negative correlation
+    w = np.random.normal(100, 15, 100)  # No correlation
+
+    return pd.DataFrame({
+        'var_x': x,
+        'var_y': y,
+        'var_z': z,
+        'var_w': w
+    })
+
+
+@pytest.fixture
+def mixed_data():
+    """Create data with mixed types."""
+    return pd.DataFrame({
+        'numerical': np.random.normal(100, 15, 100),
+        'categorical': np.random.choice(['Cat1', 'Cat2', 'Cat3'], 100),
+        'date': pd.date_range('2024-01-01', periods=100),
+        'sales': np.random.randint(50, 500, 100)
+    })
+
+
+# ============================================================================
+# TOP/BOTTOM PERFORMERS TESTS
+# ============================================================================
+
+class TestTopBottomPerformers:
+    """Test suite for TopBottomPerformers class."""
+
+    def test_initialization(self):
+        """Test TopBottomPerformers initialization."""
+        insight = TopBottomPerformers()
+        assert insight is not None
+
+    def test_get_insight_type(self):
+        """Test getting insight type."""
+        insight = TopBottomPerformers()
+        assert insight.get_insight_type() == "top_bottom_performers"
+
+    def test_generate_simple(self, sales_data):
+        """Test generating simple top/bottom insights."""
+        insight = TopBottomPerformers()
+        result = insight.generate(sales_data, column='sales')
+
+        assert result['type'] == 'top_bottom_performers'
+        assert 'top_performers' in result
+        assert 'bottom_performers' in result
+        assert 'summary' in result
+
+    def test_generate_with_groupby(self, sales_data):
+        """Test generating insights with groupby."""
+        insight = TopBottomPerformers()
+        result = insight.generate(
+            sales_data,
+            column='sales',
+            group_by='product',
+            aggregation='sum'
+        )
+
+        assert result['group_by'] == 'product'
+        assert result['aggregation'] == 'sum'
+        assert len(result['top_performers']['data']) > 0
+
+    def test_generate_with_custom_n(self, sales_data):
+        """Test with custom top_n and bottom_n."""
+        insight = TopBottomPerformers()
+        result = insight.generate(
+            sales_data,
+            column='sales',
+            top_n=3,
+            bottom_n=3
+        )
+
+        assert result['top_performers']['count'] <= 3
+        assert result['bottom_performers']['count'] <= 3
+
+    def test_invalid_column(self, sales_data):
+        """Test with invalid column."""
+        insight = TopBottomPerformers()
+        with pytest.raises(ValueError):
+            insight.generate(sales_data, column='nonexistent')
+
+
+# ============================================================================
+# TREND ANALYSIS TESTS
+# ============================================================================
+
+class TestTrendAnalysis:
+    """Test suite for TrendAnalysis class."""
+
+    def test_initialization(self):
+        """Test TrendAnalysis initialization."""
+        insight = TrendAnalysis()
+        assert insight is not None
+
+    def test_get_insight_type(self):
+        """Test getting insight type."""
+        insight = TrendAnalysis()
+        assert insight.get_insight_type() == "trend_analysis"
+
+    def test_generate_trend(self, time_series_data):
+        """Test generating trend insights."""
+        insight = TrendAnalysis()
+        result = insight.generate(
+            time_series_data,
+            date_column='date',
+            value_column='value'
+        )
+
+        assert result['type'] == 'trend_analysis'
+        assert 'trend_direction' in result
+        assert 'metrics' in result
+        assert 'date_range' in result
+        assert 'summary' in result
+
+    def test_trend_metrics(self, time_series_data):
+        """Test trend metrics calculation."""
+        insight = TrendAnalysis()
+        result = insight.generate(
+            time_series_data,
+            date_column='date',
+            value_column='value'
+        )
+
+        metrics = result['metrics']
+        assert 'first_value' in metrics
+        assert 'last_value' in metrics
+        assert 'absolute_change' in metrics
+        assert 'percentage_change' in metrics
+        assert 'growth_rate' in metrics
+        assert 'volatility' in metrics
+
+    def test_insufficient_data(self):
+        """Test with insufficient data."""
+        df = pd.DataFrame({
+            'date': [pd.Timestamp('2024-01-01')],
+            'value': [100]
+        })
+
+        insight = TrendAnalysis()
+        result = insight.generate(df, date_column='date', value_column='value')
+
+        assert 'error' in result
+
+    def test_invalid_columns(self, time_series_data):
+        """Test with invalid columns."""
+        insight = TrendAnalysis()
+        with pytest.raises(ValueError):
+            insight.generate(
+                time_series_data,
+                date_column='nonexistent',
+                value_column='value'
+            )
+
+
+# ============================================================================
+# ANOMALY DETECTION TESTS
+# ============================================================================
+
+class TestAnomalyDetection:
+    """Test suite for AnomalyDetection class."""
+
+    def test_initialization(self):
+        """Test AnomalyDetection initialization."""
+        insight = AnomalyDetection()
+        assert insight is not None
+
+    def test_get_insight_type(self):
+        """Test getting insight type."""
+        insight = AnomalyDetection()
+        assert insight.get_insight_type() == "anomaly_detection"
+
+    def test_detect_zscore(self, anomaly_data):
+        """Test Z-score anomaly detection."""
+        insight = AnomalyDetection()
+        result = insight.generate(
+            anomaly_data,
+            column='values',
+            method='zscore',
+            threshold=2.5
+        )
+
+        assert result['type'] == 'anomaly_detection'
+        assert result['method'] == 'zscore'
+        assert 'statistics' in result
+        assert 'anomalies' in result
+
+    def test_detect_iqr(self, anomaly_data):
+        """Test IQR anomaly detection."""
+        insight = AnomalyDetection()
+        result = insight.generate(
+            anomaly_data,
+            column='values',
+            method='iqr',
+            threshold=1.5
+        )
+
+        assert result['method'] == 'iqr'
+        assert result['statistics']['anomaly_count'] >= 0
+
+    def test_no_anomalies(self):
+        """Test when no anomalies are found."""
+        df = pd.DataFrame({
+            'values': np.random.normal(100, 1, 100)  # Very tight distribution
+        })
+
+        insight = AnomalyDetection()
+        result = insight.generate(df, column='values', threshold=10)
+
+        assert result['statistics']['anomaly_count'] == 0
+
+    def test_non_numerical_column(self, sales_data):
+        """Test with non-numerical column."""
+        insight = AnomalyDetection()
+        result = insight.generate(sales_data, column='product')
+
+        assert 'error' in result
+
+    def test_invalid_method(self, anomaly_data):
+        """Test with invalid method."""
+        insight = AnomalyDetection()
+        with pytest.raises(ValueError):
+            insight.generate(anomaly_data, column='values', method='invalid')
+
+
+# ============================================================================
+# DISTRIBUTION INSIGHTS TESTS
+# ============================================================================
+
+class TestDistributionInsights:
+    """Test suite for DistributionInsights class."""
+
+    def test_initialization(self):
+        """Test DistributionInsights initialization."""
+        insight = DistributionInsights()
+        assert insight is not None
+
+    def test_get_insight_type(self):
+        """Test getting insight type."""
+        insight = DistributionInsights()
+        assert insight.get_insight_type() == "distribution_insights"
+
+    def test_numerical_distribution(self, sales_data):
+        """Test numerical distribution analysis."""
+        insight = DistributionInsights()
+        result = insight.generate(sales_data, column='sales')
+
+        assert result['type'] == 'distribution_insights'
+        assert result['data_type'] == 'numerical'
+        assert 'statistics' in result
+        assert 'distribution_shape' in result
+
+    def test_numerical_statistics(self, sales_data):
+        """Test numerical statistics calculation."""
+        insight = DistributionInsights()
+        result = insight.generate(sales_data, column='sales')
+
+        stats = result['statistics']
+        assert 'mean' in stats
+        assert 'median' in stats
+        assert 'std' in stats
+        assert 'skewness' in stats
+        assert 'kurtosis' in stats
+
+    def test_categorical_distribution(self, sales_data):
+        """Test categorical distribution analysis."""
+        insight = DistributionInsights()
+        result = insight.generate(sales_data, column='product')
+
+        assert result['data_type'] == 'categorical'
+        assert 'value_counts' in result
+        assert 'most_common' in result['statistics']
+
+    def test_empty_column(self):
+        """Test with empty column."""
+        df = pd.DataFrame({'col': [np.nan, np.nan, np.nan]})
+
+        insight = DistributionInsights()
+        result = insight.generate(df, column='col')
+
+        assert 'error' in result
+
+
+# ============================================================================
+# CORRELATION INSIGHTS TESTS
+# ============================================================================
+
+class TestCorrelationInsights:
+    """Test suite for CorrelationInsights class."""
+
+    def test_initialization(self):
+        """Test CorrelationInsights initialization."""
+        insight = CorrelationInsights()
+        assert insight is not None
+
+    def test_get_insight_type(self):
+        """Test getting insight type."""
+        insight = CorrelationInsights()
+        assert insight.get_insight_type() == "correlation_insights"
+
+    def test_generate_correlations(self, correlation_data):
+        """Test generating correlation insights."""
+        insight = CorrelationInsights()
+        result = insight.generate(correlation_data, threshold=0.5)
+
+        assert result['type'] == 'correlation_insights'
+        assert 'strong_correlations_found' in result
+        assert 'correlations' in result
+
+    def test_strong_correlations_found(self, correlation_data):
+        """Test that strong correlations are found."""
+        insight = CorrelationInsights()
+        result = insight.generate(correlation_data, threshold=0.7)
+
+        # Should find strong correlations in our test data
+        assert result['strong_correlations_found'] > 0
+
+    def test_correlation_details(self, correlation_data):
+        """Test correlation details."""
+        insight = CorrelationInsights()
+        result = insight.generate(correlation_data, threshold=0.5)
+
+        if len(result['correlations']) > 0:
+            corr = result['correlations'][0]
+            assert 'variable1' in corr
+            assert 'variable2' in corr
+            assert 'correlation' in corr
+            assert 'strength' in corr
+            assert 'direction' in corr
+
+    def test_different_methods(self, correlation_data):
+        """Test different correlation methods."""
+        insight = CorrelationInsights()
+
+        # Pearson
+        result1 = insight.generate(correlation_data, method='pearson')
+        assert result1['method'] == 'pearson'
+
+        # Spearman
+        result2 = insight.generate(correlation_data, method='spearman')
+        assert result2['method'] == 'spearman'
+
+    def test_insufficient_columns(self):
+        """Test with insufficient numerical columns."""
+        df = pd.DataFrame({'col': [1, 2, 3]})
+
+        insight = CorrelationInsights()
+        result = insight.generate(df)
+
+        assert 'error' in result
+
+
+# ============================================================================
+# INSIGHT MANAGER TESTS
+# ============================================================================
+
+class TestInsightManager:
+    """Test suite for InsightManager class."""
+
+    def test_initialization(self):
+        """Test InsightManager initialization."""
+        manager = InsightManager()
+        assert manager is not None
+        assert len(manager.strategies) >= 5
+
+    def test_get_available_insights(self):
+        """Test getting available insights."""
+        manager = InsightManager()
+        available = manager.get_available_insights()
+
+        assert 'top_bottom' in available
+        assert 'trend' in available
+        assert 'anomaly' in available
+        assert 'distribution' in available
+        assert 'correlation' in available
+
+    def test_generate_top_bottom(self, sales_data):
+        """Test generating top/bottom insight through manager."""
+        manager = InsightManager()
+        result = manager.generate_insight(
+            'top_bottom',
+            sales_data,
+            column='sales'
+        )
+
+        assert result['type'] == 'top_bottom_performers'
+
+    def test_generate_trend(self, time_series_data):
+        """Test generating trend insight through manager."""
+        manager = InsightManager()
+        result = manager.generate_insight(
+            'trend',
+            time_series_data,
+            date_column='date',
+            value_column='value'
+        )
+
+        assert result['type'] == 'trend_analysis'
+
+    def test_generate_anomaly(self, anomaly_data):
+        """Test generating anomaly insight through manager."""
+        manager = InsightManager()
+        result = manager.generate_insight(
+            'anomaly',
+            anomaly_data,
+            column='values'
+        )
+
+        assert result['type'] == 'anomaly_detection'
+
+    def test_generate_distribution(self, sales_data):
+        """Test generating distribution insight through manager."""
+        manager = InsightManager()
+        result = manager.generate_insight(
+            'distribution',
+            sales_data,
+            column='sales'
+        )
+
+        assert result['type'] == 'distribution_insights'
+
+    def test_generate_correlation(self, correlation_data):
+        """Test generating correlation insight through manager."""
+        manager = InsightManager()
+        result = manager.generate_insight(
+            'correlation',
+            correlation_data
+        )
+
+        assert result['type'] == 'correlation_insights'
+
+    def test_unsupported_insight_type(self, sales_data):
+        """Test with unsupported insight type."""
+        manager = InsightManager()
+
+        with pytest.raises(ValueError, match="Unsupported insight type"):
+            manager.generate_insight('invalid_type', sales_data)
+
+    def test_generate_all_insights(self, mixed_data):
+        """Test generating all insights."""
+        manager = InsightManager()
+        results = manager.generate_all_insights(mixed_data)
+
+        assert isinstance(results, dict)
+        # Should generate at least some insights
+        assert len(results) > 0
+
+    def test_add_strategy(self):
+        """Test adding new strategy."""
+        manager = InsightManager()
+        initial_count = len(manager.strategies)
+
+        # Create mock strategy
+        class MockStrategy(InsightStrategy):
+            def generate(self, df, **kwargs):
+                return {'type': 'mock'}
+
+            def get_insight_type(self):
+                return 'mock'
+
+        manager.add_strategy('mock', MockStrategy())
+        assert len(manager.strategies) == initial_count + 1
+        assert 'mock' in manager.get_available_insights()
+
+    def test_format_insight_report(self, sales_data):
+        """Test formatting insight report."""
+        manager = InsightManager()
+        insights = {
+            'top_bottom': manager.generate_insight(
+                'top_bottom', sales_data, column='sales'
+            )
+        }
+
+        report = manager.format_insight_report(insights)
+        assert isinstance(report, str)
+        assert 'INSIGHTS REPORT' in report
+        assert 'TOP BOTTOM' in report
+
+
+# ============================================================================
+# RUN TESTS
+# ============================================================================
+
+if __name__ == "__main__":
+    pytest.main([__file__, "-v", "--tb=short"])
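The manager tests above double as a recipe for extending the insights layer: anything exposing `generate(df, **kwargs)` and `get_insight_type()` can be registered under a name. A minimal end-to-end sketch, assuming `InsightStrategy` is an abstract base with exactly those two methods (which is what the MockStrategy in test_add_strategy suggests):

    import pandas as pd
    from insights import InsightStrategy, InsightManager

    class RowCountInsight(InsightStrategy):
        """Toy strategy: reports how many rows the DataFrame has."""

        def generate(self, df: pd.DataFrame, **kwargs) -> dict:
            return {'type': self.get_insight_type(), 'rows': len(df)}

        def get_insight_type(self) -> str:
            return 'row_count'

    manager = InsightManager()
    manager.add_strategy('row_count', RowCountInsight())
    result = manager.generate_insight('row_count', pd.DataFrame({'a': [1, 2, 3]}))
    # result == {'type': 'row_count', 'rows': 3}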
tests/test_utils.py
ADDED
@@ -0,0 +1,436 @@
+"""
+Unit Tests for Utils Module
+
+Tests all utility functions and classes following best practices.
+Uses pytest framework for comprehensive testing.
+
+Author: Craig
+Date: December 2024
+"""
+
+import pytest
+import pandas as pd
+import numpy as np
+from pathlib import Path
+import tempfile
+import os
+
+from utils import (
+    FileValidator, DataFrameValidator, ColumnValidator,
+    format_number, format_percentage, safe_divide,
+    get_column_types, detect_date_columns, clean_currency_column,
+    truncate_string, get_memory_usage,
+    CSVExporter, ExcelExporter, Config
+)
+
+
+# ============================================================================
+# FIXTURES
+# Reusable test data following DRY principle
+# ============================================================================
+
+@pytest.fixture
+def sample_dataframe():
+    """Create a sample DataFrame for testing."""
+    return pd.DataFrame({
+        'age': [25, 30, 35, 40],
+        'name': ['Alice', 'Bob', 'Charlie', 'David'],
+        'salary': [50000, 60000, 70000, 80000],
+        'date': pd.date_range('2024-01-01', periods=4)
+    })
+
+
+@pytest.fixture
+def empty_dataframe():
+    """Create an empty DataFrame for testing."""
+    return pd.DataFrame()
+
+
+@pytest.fixture
+def temp_csv_file():
+    """Create a temporary CSV file."""
+    with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False) as f:
+        f.write('col1,col2\n1,2\n3,4\n')
+        temp_path = f.name
+    yield temp_path
+    # Cleanup
+    if os.path.exists(temp_path):
+        os.remove(temp_path)
+
+
+@pytest.fixture
+def temp_xlsx_file():
+    """Create a temporary Excel file."""
+    temp_path = tempfile.mktemp(suffix='.xlsx')
+    df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
+    df.to_excel(temp_path, index=False)
+    yield temp_path
+    # Cleanup
+    if os.path.exists(temp_path):
+        os.remove(temp_path)
+
+
+# ============================================================================
+# VALIDATOR TESTS
+# ============================================================================
+
+class TestFileValidator:
+    """Test suite for FileValidator class."""
+
+    def test_validate_existing_csv(self, temp_csv_file):
+        """Test validation of existing CSV file."""
+        validator = FileValidator()
+        assert validator.validate(temp_csv_file) is True
+
+    def test_validate_existing_xlsx(self, temp_xlsx_file):
+        """Test validation of existing Excel file."""
+        validator = FileValidator()
+        assert validator.validate(temp_xlsx_file) is True
+
+    def test_validate_nonexistent_file(self):
+        """Test validation of non-existent file."""
+        validator = FileValidator()
+        with pytest.raises(FileNotFoundError):
+            validator.validate('nonexistent_file.csv')
+
+    def test_validate_unsupported_format(self):
+        """Test validation of unsupported file format."""
+        validator = FileValidator()
+        with tempfile.NamedTemporaryFile(suffix='.txt', delete=False) as f:
+            temp_path = f.name
+
+        try:
+            with pytest.raises(ValueError, match="Unsupported file format"):
+                validator.validate(temp_path)
+        finally:
+            if os.path.exists(temp_path):
+                os.remove(temp_path)
+
+    def test_supported_formats(self):
+        """Test that all expected formats are supported."""
+        validator = FileValidator()
+        expected_formats = {'.csv', '.xlsx', '.xls', '.parquet', '.json', '.tsv'}
+        assert validator.SUPPORTED_FORMATS == expected_formats
+
+
+class TestDataFrameValidator:
+    """Test suite for DataFrameValidator class."""
+
+    def test_validate_valid_dataframe(self, sample_dataframe):
+        """Test validation of valid DataFrame."""
+        validator = DataFrameValidator()
+        assert validator.validate(sample_dataframe) is True
+
+    def test_validate_empty_dataframe(self, empty_dataframe):
+        """Test validation of empty DataFrame."""
+        validator = DataFrameValidator()
+        with pytest.raises(ValueError, match="DataFrame is empty"):
+            validator.validate(empty_dataframe)
+
+    def test_validate_none_dataframe(self):
+        """Test validation of None DataFrame."""
+        validator = DataFrameValidator()
+        with pytest.raises(ValueError, match="DataFrame cannot be None"):
+            validator.validate(None)
+
+    def test_validate_wrong_type(self):
+        """Test validation of wrong data type."""
+        validator = DataFrameValidator()
+        with pytest.raises(ValueError, match="Expected pandas DataFrame"):
+            validator.validate([1, 2, 3])
+
+
+class TestColumnValidator:
+    """Test suite for ColumnValidator class."""
+
+    def test_validate_existing_column(self, sample_dataframe):
+        """Test validation of existing column."""
+        validator = ColumnValidator()
+        assert validator.validate(sample_dataframe, 'age') is True
+
+    def test_validate_existing_columns_list(self, sample_dataframe):
+        """Test validation of multiple existing columns."""
+        validator = ColumnValidator()
+        assert validator.validate(sample_dataframe, ['age', 'name']) is True
+
+    def test_validate_missing_column(self, sample_dataframe):
+        """Test validation of missing column."""
+        validator = ColumnValidator()
+        with pytest.raises(ValueError, match="Columns not found"):
+            validator.validate(sample_dataframe, 'nonexistent')
+
+    def test_validate_partial_missing_columns(self, sample_dataframe):
+        """Test validation with some missing columns."""
+        validator = ColumnValidator()
+        with pytest.raises(ValueError, match="Columns not found"):
+            validator.validate(sample_dataframe, ['age', 'nonexistent'])
+
+
+# ============================================================================
+# FORMATTING FUNCTION TESTS
+# ============================================================================
+
+class TestFormatNumber:
+    """Test suite for format_number function."""
+
+    def test_format_integer(self):
+        """Test formatting integer."""
+        assert format_number(1234567) == "1,234,567"
+
+    def test_format_float(self):
+        """Test formatting float."""
+        assert format_number(1234567.89) == "1,234,567.89"
+
+    def test_format_with_decimals(self):
+        """Test formatting with specific decimal places."""
+        assert format_number(1234.5678, decimals=3) == "1,234.568"
+
+    def test_format_nan(self):
+        """Test formatting NaN value."""
+        assert format_number(np.nan) == "N/A"
+
+    def test_format_none(self):
+        """Test formatting None value."""
+        assert format_number(None) == "N/A"
+
+
+class TestFormatPercentage:
+    """Test suite for format_percentage function."""
+
+    def test_format_valid_percentage(self):
+        """Test formatting valid percentage."""
+        assert format_percentage(0.456) == "45.60%"
+
+    def test_format_zero_percentage(self):
+        """Test formatting zero percentage."""
+        assert format_percentage(0.0) == "0.00%"
+
+    def test_format_one_hundred_percent(self):
+        """Test formatting 100%."""
+        assert format_percentage(1.0) == "100.00%"
+
+    def test_format_nan_percentage(self):
+        """Test formatting NaN percentage."""
+        assert format_percentage(np.nan) == "N/A"
+
+    def test_format_custom_decimals(self):
+        """Test formatting with custom decimal places."""
+        assert format_percentage(0.12345, decimals=3) == "12.345%"
+
+
+class TestSafeDivide:
+    """Test suite for safe_divide function."""
+
+    def test_normal_division(self):
+        """Test normal division."""
+        assert safe_divide(10, 2) == 5.0
+
+    def test_division_by_zero(self):
+        """Test division by zero returns default."""
+        assert safe_divide(10, 0, default=0.0) == 0.0
+
+    def test_division_by_nan(self):
+        """Test division by NaN returns default."""
+        assert safe_divide(10, np.nan, default=-1.0) == -1.0
+
+    def test_custom_default(self):
+        """Test custom default value."""
+        assert safe_divide(10, 0, default=999) == 999
+
+
+# ============================================================================
+# DATA ANALYSIS FUNCTION TESTS
+# ============================================================================
+
+class TestGetColumnTypes:
+    """Test suite for get_column_types function."""
+
+    def test_mixed_types(self, sample_dataframe):
+        """Test getting column types from mixed DataFrame."""
+        types = get_column_types(sample_dataframe)
+        assert 'age' in types['numerical']
+        assert 'salary' in types['numerical']
+        assert 'name' in types['categorical']
+        assert 'date' in types['datetime']
+
+    def test_only_numerical(self):
+        """Test DataFrame with only numerical columns."""
+        df = pd.DataFrame({'a': [1, 2], 'b': [3.0, 4.0]})
+        types = get_column_types(df)
+        assert len(types['numerical']) == 2
+        assert len(types['categorical']) == 0
+
+    def test_only_categorical(self):
+        """Test DataFrame with only categorical columns."""
+        df = pd.DataFrame({'a': ['x', 'y'], 'b': ['z', 'w']})
+        types = get_column_types(df)
+        assert len(types['categorical']) == 2
+        assert len(types['numerical']) == 0
+
+
+class TestDetectDateColumns:
+    """Test suite for detect_date_columns function."""
+
+    def test_detect_date_string_column(self):
+        """Test detecting date strings."""
+        df = pd.DataFrame({
+            'date_col': ['2024-01-01', '2024-01-02', '2024-01-03'],
+            'text_col': ['abc', 'def', 'ghi']
+        })
+        date_cols = detect_date_columns(df)
+        assert 'date_col' in date_cols
+        assert 'text_col' not in date_cols
+
+    def test_no_date_columns(self):
+        """Test DataFrame without date columns."""
+        df = pd.DataFrame({
+            'num': [1, 2, 3],
+            'text': ['a', 'b', 'c']
+        })
+        date_cols = detect_date_columns(df)
+        assert len(date_cols) == 0
+
+
+class TestCleanCurrencyColumn:
+    """Test suite for clean_currency_column function."""
+
+    def test_clean_dollar_signs(self):
+        """Test cleaning dollar signs."""
+        s = pd.Series(['$1,234.56', '$789.00', '$1,000.00'])
+        result = clean_currency_column(s)
+        expected = pd.Series([1234.56, 789.00, 1000.00])
+        pd.testing.assert_series_equal(result, expected)
+
+    def test_clean_spaces(self):
+        """Test cleaning spaces in currency."""
+        s = pd.Series(['$966 ', '$193 '])
+        result = clean_currency_column(s)
+        assert result[0] == 966.0
+        assert result[1] == 193.0
+
+    def test_handle_invalid_values(self):
+        """Test handling invalid currency values."""
+        s = pd.Series(['$100', 'invalid', '$200'])
+        result = clean_currency_column(s)
+        assert result[0] == 100.0
+        assert pd.isna(result[1])
+        assert result[2] == 200.0
+
+
+class TestTruncateString:
+    """Test suite for truncate_string function."""
+
+    def test_truncate_long_string(self):
+        """Test truncating long string."""
+        text = "This is a very long text that needs truncation"
+        result = truncate_string(text, max_length=20)
+        assert len(result) == 20
+        assert result.endswith("...")
+
+    def test_no_truncation_needed(self):
+        """Test string that doesn't need truncation."""
+        text = "Short text"
+        result = truncate_string(text, max_length=20)
+        assert result == text
+
+    def test_custom_suffix(self):
+        """Test custom truncation suffix."""
+        text = "Long text here"
+        result = truncate_string(text, max_length=10, suffix=">>")
+        assert result.endswith(">>")
+
+
+class TestGetMemoryUsage:
+    """Test suite for get_memory_usage function."""
+
+    def test_small_dataframe(self):
+        """Test memory usage of small DataFrame."""
+        df = pd.DataFrame({'a': [1, 2, 3]})
+        usage = get_memory_usage(df)
+        assert 'B' in usage or 'KB' in usage
+
+    def test_returns_string(self, sample_dataframe):
+        """Test that function returns string."""
+        usage = get_memory_usage(sample_dataframe)
+        assert isinstance(usage, str)
+
+
+# ============================================================================
+# EXPORTER TESTS
+# ============================================================================
+
+class TestCSVExporter:
+    """Test suite for CSVExporter class."""
+
+    def test_export_csv(self, sample_dataframe):
+        """Test exporting DataFrame to CSV."""
+        exporter = CSVExporter()
+        temp_path = tempfile.mktemp(suffix='.csv')
+
+        try:
+            result = exporter.export(sample_dataframe, temp_path)
+            assert result is True
+            assert os.path.exists(temp_path)
+
+            # Verify content
+            df_loaded = pd.read_csv(temp_path)
+            assert df_loaded.shape == sample_dataframe.shape
+        finally:
+            if os.path.exists(temp_path):
+                os.remove(temp_path)
+
+
+class TestExcelExporter:
+    """Test suite for ExcelExporter class."""
+
+    def test_export_excel(self, sample_dataframe):
+        """Test exporting DataFrame to Excel."""
+        exporter = ExcelExporter()
+        temp_path = tempfile.mktemp(suffix='.xlsx')
+
+        try:
+            # Remove datetime column for Excel compatibility
+            df_test = sample_dataframe.drop('date', axis=1)
+            result = exporter.export(df_test, temp_path)
+            assert result is True
+            assert os.path.exists(temp_path)
+
+            # Verify content
+            df_loaded = pd.read_excel(temp_path)
+            assert df_loaded.shape == df_test.shape
+        finally:
+            if os.path.exists(temp_path):
+                os.remove(temp_path)
+
+
+# ============================================================================
+# CONFIG TESTS
+# ============================================================================
+
+class TestConfig:
+    """Test suite for Config class."""
+
+    def test_supported_formats_exists(self):
+        """Test that supported formats are defined."""
+        assert hasattr(Config, 'SUPPORTED_FILE_FORMATS')
+        assert len(Config.SUPPORTED_FILE_FORMATS) > 0
+
+    def test_display_settings_exist(self):
+        """Test that display settings are defined."""
+        assert hasattr(Config, 'MAX_DISPLAY_ROWS')
+        assert hasattr(Config, 'MAX_STRING_LENGTH')
+        assert hasattr(Config, 'DEFAULT_DECIMAL_PLACES')
+
+    def test_config_values_valid(self):
+        """Test that config values are valid."""
+        assert Config.MAX_DISPLAY_ROWS > 0
+        assert Config.MAX_STRING_LENGTH > 0
+        assert Config.DEFAULT_DECIMAL_PLACES >= 0
+
+
+# ============================================================================
+# RUN TESTS
+# ============================================================================
+
+if __name__ == "__main__":
+    pytest.main([__file__, "-v", "--tb=short"])
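Read as a group, the assertions in this file also serve as a compact usage reference for the utils API; the expected outputs below are lifted directly from the tests:

    from utils import format_number, format_percentage, safe_divide

    format_number(1234567)                # "1,234,567"
    format_number(1234.5678, decimals=3)  # "1,234.568"
    format_percentage(0.456)              # "45.60%"
    safe_divide(10, 0, default=0.0)       # 0.0 instead of raising ZeroDivisionError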
tests/test_visualizations.py
ADDED
@@ -0,0 +1,665 @@
| 1 |
+
"""
|
| 2 |
+
Unit Tests for Visualizations Module
|
| 3 |
+
|
| 4 |
+
Comprehensive tests for all visualization strategies and the visualization manager.
|
| 5 |
+
|
| 6 |
+
Author: Craig
|
| 7 |
+
Date: December 2024
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
import pytest
|
| 11 |
+
import pandas as pd
|
| 12 |
+
import numpy as np
|
| 13 |
+
import matplotlib.pyplot as plt
|
| 14 |
+
from pathlib import Path
|
| 15 |
+
import tempfile
|
| 16 |
+
import os
|
| 17 |
+
|
| 18 |
+
from visualizations import (
|
| 19 |
+
VisualizationStrategy, TimeSeriesPlot, DistributionPlot,
|
| 20 |
+
CategoryPlot, ScatterPlot, CorrelationHeatmap,
|
| 21 |
+
VisualizationManager, save_visualization
|
| 22 |
+
)
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
# ============================================================================
|
| 26 |
+
# FIXTURES
|
| 27 |
+
# ============================================================================
|
| 28 |
+
|
| 29 |
+
@pytest.fixture
|
| 30 |
+
def time_series_data():
|
| 31 |
+
"""Create sample time series data."""
|
| 32 |
+
dates = pd.date_range('2024-01-01', periods=100, freq='D')
|
| 33 |
+
return pd.DataFrame({
|
| 34 |
+
'date': dates,
|
| 35 |
+
'sales': np.random.randint(100, 1000, 100),
|
| 36 |
+
'revenue': np.random.uniform(1000, 5000, 100)
|
| 37 |
+
})
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
@pytest.fixture
|
| 41 |
+
def numerical_data():
|
| 42 |
+
"""Create sample numerical data."""
|
| 43 |
+
np.random.seed(42)
|
| 44 |
+
return pd.DataFrame({
|
| 45 |
+
'values': np.random.normal(100, 15, 1000),
|
| 46 |
+
'scores': np.random.uniform(0, 100, 1000)
|
| 47 |
+
})
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
@pytest.fixture
|
| 51 |
+
def categorical_data():
|
| 52 |
+
"""Create sample categorical data."""
|
| 53 |
+
return pd.DataFrame({
|
| 54 |
+
'category': ['A', 'B', 'C', 'D', 'E'] * 20,
|
| 55 |
+
'values': np.random.randint(10, 100, 100),
|
| 56 |
+
'region': np.random.choice(['North', 'South', 'East', 'West'], 100)
|
| 57 |
+
})
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
@pytest.fixture
|
| 61 |
+
def scatter_data():
|
| 62 |
+
"""Create sample scatter plot data."""
|
| 63 |
+
np.random.seed(42)
|
| 64 |
+
x = np.random.uniform(0, 100, 200)
|
| 65 |
+
y = 2 * x + np.random.normal(0, 10, 200)
|
| 66 |
+
return pd.DataFrame({
|
| 67 |
+
'x_val': x,
|
| 68 |
+
'y_val': y,
|
| 69 |
+
'category': np.random.choice(['A', 'B', 'C'], 200),
|
| 70 |
+
'size': np.random.uniform(10, 100, 200)
|
| 71 |
+
})
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
@pytest.fixture
|
| 75 |
+
def correlation_data():
|
| 76 |
+
"""Create sample data for correlation."""
|
| 77 |
+
np.random.seed(42)
|
| 78 |
+
return pd.DataFrame({
|
| 79 |
+
'var1': np.random.normal(50, 10, 100),
|
| 80 |
+
'var2': np.random.normal(100, 20, 100),
|
| 81 |
+
'var3': np.random.normal(75, 15, 100),
|
| 82 |
+
'var4': np.random.normal(60, 12, 100)
|
| 83 |
+
})
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
# ============================================================================
|
| 87 |
+
# TIME SERIES PLOT TESTS
|
| 88 |
+
# ============================================================================
|
| 89 |
+
|
| 90 |
+
class TestTimeSeriesPlot:
|
| 91 |
+
"""Test suite for TimeSeriesPlot class."""
|
| 92 |
+
|
| 93 |
+
def test_initialization(self):
|
| 94 |
+
"""Test TimeSeriesPlot initialization."""
|
| 95 |
+
plot = TimeSeriesPlot()
|
| 96 |
+
assert plot is not None
|
| 97 |
+
|
| 98 |
+
def test_get_required_params(self):
|
| 99 |
+
"""Test getting required parameters."""
|
| 100 |
+
plot = TimeSeriesPlot()
|
| 101 |
+
params = plot.get_required_params()
|
| 102 |
+
assert 'date_column' in params
|
| 103 |
+
assert 'value_column' in params
|
| 104 |
+
|
| 105 |
+
def test_create_matplotlib_basic(self, time_series_data):
|
| 106 |
+
"""Test creating basic matplotlib time series plot."""
|
| 107 |
+
plot = TimeSeriesPlot()
|
| 108 |
+
fig = plot.create(time_series_data,
|
| 109 |
+
date_column='date',
|
| 110 |
+
value_column='sales',
|
| 111 |
+
backend='matplotlib')
|
| 112 |
+
|
| 113 |
+
assert fig is not None
|
| 114 |
+
assert hasattr(fig, 'savefig')
|
| 115 |
+
plt.close(fig)
|
| 116 |
+
|
| 117 |
+
def test_create_plotly_basic(self, time_series_data):
|
| 118 |
+
"""Test creating basic plotly time series plot."""
|
| 119 |
+
plot = TimeSeriesPlot()
|
| 120 |
+
fig = plot.create(time_series_data,
|
| 121 |
+
date_column='date',
|
| 122 |
+
value_column='sales',
|
| 123 |
+
backend='plotly')
|
| 124 |
+
|
| 125 |
+
assert fig is not None
|
| 126 |
+
assert hasattr(fig, 'write_html')
|
| 127 |
+
|
| 128 |
+
def test_aggregation_sum(self, time_series_data):
|
| 129 |
+
"""Test time series with sum aggregation."""
|
| 130 |
+
plot = TimeSeriesPlot()
|
| 131 |
+
fig = plot.create(time_series_data,
|
| 132 |
+
date_column='date',
|
| 133 |
+
value_column='sales',
|
| 134 |
+
aggregation='sum',
|
| 135 |
+
backend='matplotlib')
|
| 136 |
+
|
| 137 |
+
assert fig is not None
|
| 138 |
+
plt.close(fig)
|
| 139 |
+
|
| 140 |
+
def test_aggregation_mean(self, time_series_data):
|
| 141 |
+
"""Test time series with mean aggregation."""
|
| 142 |
+
plot = TimeSeriesPlot()
|
| 143 |
+
fig = plot.create(time_series_data,
|
| 144 |
+
date_column='date',
|
| 145 |
+
value_column='sales',
|
| 146 |
+
aggregation='mean',
|
| 147 |
+
backend='matplotlib')
|
| 148 |
+
|
| 149 |
+
assert fig is not None
|
| 150 |
+
plt.close(fig)
|
| 151 |
+
|
| 152 |
+
def test_invalid_date_column(self, time_series_data):
|
| 153 |
+
"""Test with invalid date column."""
|
| 154 |
+
plot = TimeSeriesPlot()
|
| 155 |
+
with pytest.raises(ValueError):
|
| 156 |
+
plot.create(time_series_data,
|
| 157 |
+
date_column='nonexistent',
|
| 158 |
+
value_column='sales')
|
| 159 |
+
|
| 160 |
+
def test_invalid_backend(self, time_series_data):
|
| 161 |
+
"""Test with invalid backend."""
|
| 162 |
+
plot = TimeSeriesPlot()
|
| 163 |
+
with pytest.raises(ValueError, match="Unsupported backend"):
|
| 164 |
+
plot.create(time_series_data,
|
| 165 |
+
date_column='date',
|
| 166 |
+
value_column='sales',
|
| 167 |
+
backend='invalid')
|
| 168 |
+
|
| 169 |
+
|
| 170 |
+
# ============================================================================
|
| 171 |
+
# DISTRIBUTION PLOT TESTS
|
| 172 |
+
# ============================================================================
|
| 173 |
+
|
| 174 |
+
class TestDistributionPlot:
|
| 175 |
+
"""Test suite for DistributionPlot class."""
|
| 176 |
+
|
| 177 |
+
def test_initialization(self):
|
| 178 |
+
"""Test DistributionPlot initialization."""
|
| 179 |
+
plot = DistributionPlot()
|
| 180 |
+
assert plot is not None
|
| 181 |
+
|
| 182 |
+
def test_get_required_params(self):
|
| 183 |
+
"""Test getting required parameters."""
|
| 184 |
+
plot = DistributionPlot()
|
| 185 |
+
params = plot.get_required_params()
|
| 186 |
+
assert 'column' in params
|
| 187 |
+
|
| 188 |
+
def test_create_histogram_matplotlib(self, numerical_data):
|
| 189 |
+
"""Test creating histogram with matplotlib."""
|
| 190 |
+
plot = DistributionPlot()
|
| 191 |
+
fig = plot.create(numerical_data,
|
| 192 |
+
column='values',
|
| 193 |
+
plot_type='histogram',
|
| 194 |
+
backend='matplotlib')
|
| 195 |
+
|
| 196 |
+
assert fig is not None
|
| 197 |
+
plt.close(fig)
|
| 198 |
+
|
| 199 |
+
def test_create_box_matplotlib(self, numerical_data):
|
| 200 |
+
"""Test creating box plot with matplotlib."""
|
| 201 |
+
plot = DistributionPlot()
|
| 202 |
+
fig = plot.create(numerical_data,
|
| 203 |
+
column='values',
|
| 204 |
+
plot_type='box',
|
| 205 |
+
backend='matplotlib')
|
| 206 |
+
|
| 207 |
+
assert fig is not None
|
| 208 |
+
plt.close(fig)
|
| 209 |
+
|
| 210 |
+
def test_create_violin_matplotlib(self, numerical_data):
|
| 211 |
+
"""Test creating violin plot with matplotlib."""
|
| 212 |
+
plot = DistributionPlot()
|
| 213 |
+
fig = plot.create(numerical_data,
|
| 214 |
+
column='values',
|
| 215 |
+
plot_type='violin',
|
| 216 |
+
backend='matplotlib')
|
| 217 |
+
|
| 218 |
+
assert fig is not None
|
| 219 |
+
plt.close(fig)
|
| 220 |
+
|
| 221 |
+
def test_create_histogram_plotly(self, numerical_data):
|
| 222 |
+
"""Test creating histogram with plotly."""
|
| 223 |
+
plot = DistributionPlot()
|
| 224 |
+
fig = plot.create(numerical_data,
|
| 225 |
+
column='values',
|
| 226 |
+
plot_type='histogram',
|
| 227 |
+
backend='plotly')
|
| 228 |
+
|
| 229 |
+
assert fig is not None
|
| 230 |
+
|
| 231 |
+
def test_custom_bins(self, numerical_data):
|
| 232 |
+
"""Test histogram with custom bins."""
|
| 233 |
+
plot = DistributionPlot()
|
| 234 |
+
fig = plot.create(numerical_data,
|
| 235 |
+
column='values',
|
| 236 |
+
plot_type='histogram',
|
| 237 |
+
bins=50,
|
| 238 |
+
backend='matplotlib')
|
| 239 |
+
|
| 240 |
+
assert fig is not None
|
| 241 |
+
plt.close(fig)
|
| 242 |
+
|
| 243 |
+
def test_invalid_column(self, numerical_data):
|
| 244 |
+
"""Test with invalid column."""
|
| 245 |
+
plot = DistributionPlot()
|
| 246 |
+
with pytest.raises(ValueError):
|
| 247 |
+
plot.create(numerical_data, column='nonexistent')
|
| 248 |
+
|
| 249 |
+
def test_invalid_plot_type(self, numerical_data):
|
| 250 |
+
"""Test with invalid plot type."""
|
| 251 |
+
plot = DistributionPlot()
|
| 252 |
+
with pytest.raises(ValueError, match="Unsupported plot type"):
|
| 253 |
+
plot.create(numerical_data,
|
| 254 |
+
column='values',
|
| 255 |
+
plot_type='invalid',
|
| 256 |
+
backend='matplotlib')
|
| 257 |
+
|
| 258 |
+
|
| 259 |
+
# ============================================================================
|
| 260 |
+
# CATEGORY PLOT TESTS
|
| 261 |
+
# ============================================================================
|
| 262 |
+
|
| 263 |
+
class TestCategoryPlot:
|
| 264 |
+
"""Test suite for CategoryPlot class."""
|
| 265 |
+
|
| 266 |
+
def test_initialization(self):
|
| 267 |
+
"""Test CategoryPlot initialization."""
|
| 268 |
+
plot = CategoryPlot()
|
| 269 |
+
assert plot is not None
|
| 270 |
+
|
| 271 |
+
def test_get_required_params(self):
|
| 272 |
+
"""Test getting required parameters."""
|
| 273 |
+
plot = CategoryPlot()
|
| 274 |
+
params = plot.get_required_params()
|
| 275 |
+
assert 'column' in params
|
| 276 |
+
|
| 277 |
+
def test_create_bar_matplotlib(self, categorical_data):
|
| 278 |
+
"""Test creating bar chart with matplotlib."""
|
| 279 |
+
plot = CategoryPlot()
|
| 280 |
+
fig = plot.create(categorical_data,
|
| 281 |
+
column='category',
|
| 282 |
+
plot_type='bar',
|
| 283 |
+
backend='matplotlib')
|
| 284 |
+
|
| 285 |
+
assert fig is not None
|
| 286 |
+
plt.close(fig)
|
| 287 |
+
|
| 288 |
+
def test_create_pie_matplotlib(self, categorical_data):
|
| 289 |
+
"""Test creating pie chart with matplotlib."""
|
| 290 |
+
plot = CategoryPlot()
|
| 291 |
+
fig = plot.create(categorical_data,
|
| 292 |
+
column='category',
|
| 293 |
+
plot_type='pie',
|
| 294 |
+
backend='matplotlib')
|
| 295 |
+
|
| 296 |
+
assert fig is not None
|
| 297 |
+
plt.close(fig)
|
| 298 |
+
|
| 299 |
+
def test_create_bar_plotly(self, categorical_data):
|
| 300 |
+
"""Test creating bar chart with plotly."""
|
| 301 |
+
plot = CategoryPlot()
|
| 302 |
+
fig = plot.create(categorical_data,
|
| 303 |
+
column='category',
|
| 304 |
+
plot_type='bar',
|
| 305 |
+
backend='plotly')
|
| 306 |
+
|
| 307 |
+
assert fig is not None
|
| 308 |
+
|
| 309 |
+
def test_aggregation_sum(self, categorical_data):
|
| 310 |
+
"""Test with sum aggregation."""
|
| 311 |
+
plot = CategoryPlot()
|
| 312 |
+
fig = plot.create(categorical_data,
|
| 313 |
+
column='category',
|
| 314 |
+
value_column='values',
|
| 315 |
+
aggregation='sum',
|
| 316 |
+
backend='matplotlib')
|
| 317 |
+
|
| 318 |
+
assert fig is not None
|
| 319 |
+
plt.close(fig)
|
| 320 |
+
|
| 321 |
+
def test_top_n_categories(self, categorical_data):
|
| 322 |
+
"""Test showing only top N categories."""
|
| 323 |
+
plot = CategoryPlot()
|
| 324 |
+
fig = plot.create(categorical_data,
|
| 325 |
+
column='category',
|
| 326 |
+
top_n=3,
|
| 327 |
+
backend='matplotlib')
|
| 328 |
+
|
| 329 |
+
assert fig is not None
|
| 330 |
+
plt.close(fig)
|
| 331 |
+
|
| 332 |
+
def test_invalid_plot_type(self, categorical_data):
|
| 333 |
+
"""Test with invalid plot type."""
|
| 334 |
+
plot = CategoryPlot()
|
| 335 |
+
with pytest.raises(ValueError, match="Unsupported plot type"):
|
| 336 |
+
plot.create(categorical_data,
|
| 337 |
+
column='category',
|
| 338 |
+
plot_type='invalid',
|
| 339 |
+
backend='matplotlib')
|
| 340 |
+
|
| 341 |
+
|
| 342 |
+
# ============================================================================
|
| 343 |
+
# SCATTER PLOT TESTS
|
| 344 |
+
# ============================================================================
|
| 345 |
+
|
| 346 |
+
class TestScatterPlot:
|
| 347 |
+
"""Test suite for ScatterPlot class."""
|
| 348 |
+
|
| 349 |
+
def test_initialization(self):
|
| 350 |
+
"""Test ScatterPlot initialization."""
|
| 351 |
+
plot = ScatterPlot()
|
| 352 |
+
assert plot is not None
|
| 353 |
+
|
| 354 |
+
def test_get_required_params(self):
|
| 355 |
+
"""Test getting required parameters."""
|
| 356 |
+
plot = ScatterPlot()
|
| 357 |
+
params = plot.get_required_params()
|
| 358 |
+
assert 'x_column' in params
|
| 359 |
+
assert 'y_column' in params
|
| 360 |
+
|
| 361 |
+
def test_create_basic_matplotlib(self, scatter_data):
|
| 362 |
+
"""Test creating basic scatter plot with matplotlib."""
|
| 363 |
+
plot = ScatterPlot()
|
| 364 |
+
fig = plot.create(scatter_data,
|
| 365 |
+
x_column='x_val',
|
| 366 |
+
y_column='y_val',
|
| 367 |
+
backend='matplotlib')
|
| 368 |
+
|
| 369 |
+
assert fig is not None
|
| 370 |
+
plt.close(fig)
|
| 371 |
+
|
| 372 |
+
def test_create_basic_plotly(self, scatter_data):
|
| 373 |
+
"""Test creating basic scatter plot with plotly."""
|
| 374 |
+
plot = ScatterPlot()
|
| 375 |
+
fig = plot.create(scatter_data,
|
| 376 |
+
x_column='x_val',
|
| 377 |
+
y_column='y_val',
|
| 378 |
+
backend='plotly')
|
| 379 |
+
|
| 380 |
+
assert fig is not None
|
| 381 |
+
|
| 382 |
+
def test_with_color_column(self, scatter_data):
|
| 383 |
+
"""Test scatter plot with color coding."""
|
| 384 |
+
plot = ScatterPlot()
|
| 385 |
+
fig = plot.create(scatter_data,
|
| 386 |
+
x_column='x_val',
|
| 387 |
+
y_column='y_val',
|
| 388 |
+
color_column='category',
|
| 389 |
+
backend='matplotlib')
|
| 390 |
+
|
| 391 |
+
assert fig is not None
|
| 392 |
+
plt.close(fig)
|
| 393 |
+
|
| 394 |
+
def test_with_size_column(self, scatter_data):
|
| 395 |
+
"""Test scatter plot with size coding."""
|
| 396 |
+
plot = ScatterPlot()
|
| 397 |
+
fig = plot.create(scatter_data,
|
| 398 |
+
x_column='x_val',
|
| 399 |
+
y_column='y_val',
|
| 400 |
+
size_column='size',
|
| 401 |
+
backend='matplotlib')
|
| 402 |
+
|
| 403 |
+
assert fig is not None
|
| 404 |
+
plt.close(fig)
|
| 405 |
+
|
| 406 |
+
def test_with_trend_line(self, scatter_data):
|
| 407 |
+
"""Test scatter plot with trend line."""
|
| 408 |
+
plot = ScatterPlot()
|
| 409 |
+
fig = plot.create(scatter_data,
|
| 410 |
+
x_column='x_val',
|
| 411 |
+
y_column='y_val',
|
| 412 |
+
show_trend=True,
|
| 413 |
+
backend='matplotlib')
|
| 414 |
+
|
| 415 |
+
assert fig is not None
|
| 416 |
+
plt.close(fig)
|
| 417 |
+
|
| 418 |
+
def test_invalid_columns(self, scatter_data):
|
| 419 |
+
"""Test with invalid columns."""
|
| 420 |
+
plot = ScatterPlot()
|
| 421 |
+
with pytest.raises(ValueError):
|
| 422 |
+
plot.create(scatter_data,
|
| 423 |
+
x_column='nonexistent',
|
| 424 |
+
y_column='y_val')
|
| 425 |
+
|
| 426 |
+
|
| 427 |
+
# ============================================================================
|
| 428 |
+
# CORRELATION HEATMAP TESTS
|
| 429 |
+
# ============================================================================
|
| 430 |
+
|
| 431 |
+
class TestCorrelationHeatmap:
|
| 432 |
+
"""Test suite for CorrelationHeatmap class."""
|
| 433 |
+
|
| 434 |
+
def test_initialization(self):
|
| 435 |
+
"""Test CorrelationHeatmap initialization."""
|
| 436 |
+
plot = CorrelationHeatmap()
|
| 437 |
+
assert plot is not None
|
| 438 |
+
|
| 439 |
+
def test_get_required_params(self):
|
| 440 |
+
"""Test getting required parameters."""
|
| 441 |
+
plot = CorrelationHeatmap()
|
| 442 |
+
params = plot.get_required_params()
|
| 443 |
+
assert isinstance(params, list)
|
| 444 |
+
|
| 445 |
+
def test_create_matplotlib(self, correlation_data):
|
| 446 |
+
"""Test creating correlation heatmap with matplotlib."""
|
| 447 |
+
plot = CorrelationHeatmap()
|
| 448 |
+
fig = plot.create(correlation_data, backend='matplotlib')
|
| 449 |
+
|
| 450 |
+
assert fig is not None
|
| 451 |
+
plt.close(fig)
|
| 452 |
+
|
| 453 |
+
def test_create_plotly(self, correlation_data):
|
| 454 |
+
"""Test creating correlation heatmap with plotly."""
|
| 455 |
+
plot = CorrelationHeatmap()
|
| 456 |
+
fig = plot.create(correlation_data, backend='plotly')
|
| 457 |
+
|
| 458 |
+
assert fig is not None
|
| 459 |
+
|
| 460 |
+
def test_with_specific_columns(self, correlation_data):
|
| 461 |
+
"""Test heatmap with specific columns."""
|
| 462 |
+
plot = CorrelationHeatmap()
|
| 463 |
+
fig = plot.create(correlation_data,
|
| 464 |
+
columns=['var1', 'var2', 'var3'],
|
| 465 |
+
backend='matplotlib')
|
| 466 |
+
|
| 467 |
+
assert fig is not None
|
| 468 |
+
plt.close(fig)
|
| 469 |
+
|
| 470 |
+
def test_spearman_correlation(self, correlation_data):
|
| 471 |
+
"""Test with Spearman correlation."""
|
| 472 |
+
plot = CorrelationHeatmap()
|
| 473 |
+
fig = plot.create(correlation_data,
|
| 474 |
+
method='spearman',
|
| 475 |
+
backend='matplotlib')
|
| 476 |
+
|
| 477 |
+
assert fig is not None
|
| 478 |
+
plt.close(fig)
|
| 479 |
+
|
| 480 |
+
def test_insufficient_columns(self):
|
| 481 |
+
"""Test with insufficient numerical columns."""
|
| 482 |
+
df = pd.DataFrame({'col1': [1, 2, 3]})
|
| 483 |
+
plot = CorrelationHeatmap()
|
| 484 |
+
|
| 485 |
+
with pytest.raises(ValueError, match="at least 2 numerical columns"):
|
| 486 |
+
plot.create(df)
|
| 487 |
+
|
| 488 |
+
|
| 489 |
+
# ============================================================================
|
| 490 |
+
# VISUALIZATION MANAGER TESTS
|
| 491 |
+
# ============================================================================
|
| 492 |
+
|
| 493 |
+
class TestVisualizationManager:
|
| 494 |
+
"""Test suite for VisualizationManager class."""
|
| 495 |
+
|
| 496 |
+
def test_initialization(self):
|
| 497 |
+
"""Test VisualizationManager initialization."""
|
| 498 |
+
manager = VisualizationManager()
|
| 499 |
+
assert manager is not None
|
| 500 |
+
assert len(manager.strategies) >= 5
|
| 501 |
+
|
| 502 |
+
def test_get_available_visualizations(self):
|
| 503 |
+
"""Test getting available visualizations."""
|
| 504 |
+
manager = VisualizationManager()
|
| 505 |
+
available = manager.get_available_visualizations()
|
| 506 |
+
|
| 507 |
+
assert 'time_series' in available
|
| 508 |
+
assert 'distribution' in available
|
| 509 |
+
assert 'category' in available
|
| 510 |
+
assert 'scatter' in available
|
| 511 |
+
assert 'correlation' in available
|
| 512 |
+
|
| 513 |
+
def test_create_time_series(self, time_series_data):
|
| 514 |
+
"""Test creating time series through manager."""
|
| 515 |
+
manager = VisualizationManager()
|
| 516 |
+
fig = manager.create_visualization(
|
| 517 |
+
'time_series',
|
| 518 |
+
time_series_data,
|
| 519 |
+
date_column='date',
|
| 520 |
+
value_column='sales',
|
| 521 |
+
backend='matplotlib'
|
| 522 |
+
)
|
| 523 |
+
|
| 524 |
+
assert fig is not None
|
| 525 |
+
plt.close(fig)
|
| 526 |
+
|
| 527 |
+
def test_create_distribution(self, numerical_data):
|
| 528 |
+
"""Test creating distribution through manager."""
|
| 529 |
+
manager = VisualizationManager()
|
| 530 |
+
fig = manager.create_visualization(
|
| 531 |
+
'distribution',
|
| 532 |
+
numerical_data,
|
| 533 |
+
column='values',
|
| 534 |
+
backend='matplotlib'
|
| 535 |
+
)
|
| 536 |
+
|
| 537 |
+
assert fig is not None
|
| 538 |
+
plt.close(fig)
|
| 539 |
+
|
| 540 |
+
def test_create_category(self, categorical_data):
|
| 541 |
+
"""Test creating category plot through manager."""
|
| 542 |
+
manager = VisualizationManager()
|
| 543 |
+
fig = manager.create_visualization(
|
| 544 |
+
'category',
|
| 545 |
+
categorical_data,
|
| 546 |
+
column='category',
|
| 547 |
+
backend='matplotlib'
|
| 548 |
+
)
|
| 549 |
+
|
| 550 |
+
assert fig is not None
|
| 551 |
+
plt.close(fig)
|
| 552 |
+
|
| 553 |
+
def test_create_scatter(self, scatter_data):
|
| 554 |
+
"""Test creating scatter plot through manager."""
|
| 555 |
+
manager = VisualizationManager()
|
| 556 |
+
fig = manager.create_visualization(
|
| 557 |
+
'scatter',
|
| 558 |
+
scatter_data,
|
| 559 |
+
x_column='x_val',
|
| 560 |
+
y_column='y_val',
|
| 561 |
+
backend='matplotlib'
|
| 562 |
+
)
|
| 563 |
+
|
| 564 |
+
assert fig is not None
|
| 565 |
+
plt.close(fig)
|
| 566 |
+
|
| 567 |
+
def test_create_correlation(self, correlation_data):
|
| 568 |
+
"""Test creating correlation heatmap through manager."""
|
| 569 |
+
manager = VisualizationManager()
|
| 570 |
+
fig = manager.create_visualization(
|
| 571 |
+
'correlation',
|
| 572 |
+
correlation_data,
|
| 573 |
+
backend='matplotlib'
|
| 574 |
+
)
|
| 575 |
+
|
| 576 |
+
assert fig is not None
|
| 577 |
+
plt.close(fig)
|
| 578 |
+
|
| 579 |
+
def test_unsupported_visualization_type(self, numerical_data):
|
| 580 |
+
"""Test with unsupported visualization type."""
|
| 581 |
+
manager = VisualizationManager()
|
| 582 |
+
|
| 583 |
+
with pytest.raises(ValueError, match="Unsupported visualization type"):
|
| 584 |
+
manager.create_visualization('invalid_type', numerical_data)
|
| 585 |
+
|
| 586 |
+
def test_add_strategy(self):
|
| 587 |
+
"""Test adding new strategy."""
|
| 588 |
+
manager = VisualizationManager()
|
| 589 |
+
initial_count = len(manager.strategies)
|
| 590 |
+
|
| 591 |
+
# Create mock strategy
|
| 592 |
+
class MockStrategy(VisualizationStrategy):
|
| 593 |
+
def create(self, df, **kwargs):
|
| 594 |
+
return None
|
| 595 |
+
|
| 596 |
+
def get_required_params(self):
|
| 597 |
+
return []
|
| 598 |
+
|
| 599 |
+
manager.add_strategy('mock', MockStrategy())
|
| 600 |
+
assert len(manager.strategies) == initial_count + 1
|
| 601 |
+
assert 'mock' in manager.get_available_visualizations()
|
| 602 |
+
|
| 603 |
+
def test_get_required_params(self):
|
| 604 |
+
"""Test getting required params for visualization type."""
|
| 605 |
+
manager = VisualizationManager()
|
| 606 |
+
params = manager.get_required_params('time_series')
|
| 607 |
+
|
| 608 |
+
assert isinstance(params, list)
|
| 609 |
+
assert 'date_column' in params
|
| 610 |
+
assert 'value_column' in params
|
| 611 |
+
|
| 612 |
+
def test_get_required_params_invalid_type(self):
|
| 613 |
+
"""Test getting params for invalid type."""
|
| 614 |
+
manager = VisualizationManager()
|
| 615 |
+
|
| 616 |
+
with pytest.raises(ValueError):
|
| 617 |
+
manager.get_required_params('invalid_type')
|
| 618 |
+
|
| 619 |
+
|
| 620 |
+
# ============================================================================
|
| 621 |
+
# SAVE VISUALIZATION TESTS
|
| 622 |
+
# ============================================================================
|
| 623 |
+
|
| 624 |
+
class TestSaveVisualization:
|
| 625 |
+
"""Test suite for save_visualization function."""
|
| 626 |
+
|
| 627 |
+
def test_save_matplotlib_png(self, numerical_data):
|
| 628 |
+
"""Test saving matplotlib figure as PNG."""
|
| 629 |
+
plot = DistributionPlot()
|
| 630 |
+
fig = plot.create(numerical_data, column='values', backend='matplotlib')
|
| 631 |
+
|
| 632 |
+
temp_path = tempfile.mktemp(suffix='.png')
|
| 633 |
+
|
| 634 |
+
try:
|
| 635 |
+
result = save_visualization(fig, temp_path, format='png')
|
| 636 |
+
assert result is True
|
| 637 |
+
assert os.path.exists(temp_path)
|
| 638 |
+
finally:
|
| 639 |
+
plt.close(fig)
|
| 640 |
+
if os.path.exists(temp_path):
|
| 641 |
+
os.remove(temp_path)
|
| 642 |
+
|
| 643 |
+
def test_save_matplotlib_pdf(self, numerical_data):
|
| 644 |
+
"""Test saving matplotlib figure as PDF."""
|
| 645 |
+
plot = DistributionPlot()
|
| 646 |
+
fig = plot.create(numerical_data, column='values', backend='matplotlib')
|
| 647 |
+
|
| 648 |
+
temp_path = tempfile.mktemp(suffix='.pdf')
|
| 649 |
+
|
| 650 |
+
try:
|
| 651 |
+
result = save_visualization(fig, temp_path, format='pdf')
|
| 652 |
+
assert result is True
|
| 653 |
+
assert os.path.exists(temp_path)
|
| 654 |
+
finally:
|
| 655 |
+
plt.close(fig)
|
| 656 |
+
if os.path.exists(temp_path):
|
| 657 |
+
os.remove(temp_path)
|
| 658 |
+
|
| 659 |
+
|
| 660 |
+
# ============================================================================
|
| 661 |
+
# RUN TESTS
|
| 662 |
+
# ============================================================================
|
| 663 |
+
|
| 664 |
+
if __name__ == "__main__":
|
| 665 |
+
pytest.main([__file__, "-v", "--tb=short"])
|
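The tests above exercise every strategy through the same `create(df, ..., backend=...)` call shape, both directly and via `VisualizationManager`. A minimal usage sketch outside pytest (assuming the `visualizations` module is importable and the sample columns exist, as in the fixtures above):

```python
import pandas as pd
import numpy as np
from visualizations import TimeSeriesPlot, VisualizationManager

# Data shaped like the time_series_data fixture
df = pd.DataFrame({
    "date": pd.date_range("2024-01-01", periods=100, freq="D"),
    "sales": np.random.randint(100, 1000, 100),
})

# Direct strategy call, as in TestTimeSeriesPlot
fig = TimeSeriesPlot().create(df, date_column="date", value_column="sales",
                              backend="matplotlib")
fig.savefig("sales_trend.png")  # matplotlib Figure, per the savefig assertion

# Equivalent call routed through the manager, as in TestVisualizationManager
manager = VisualizationManager()
fig2 = manager.create_visualization("time_series", df,
                                    date_column="date", value_column="sales",
                                    backend="plotly")
fig2.write_html("sales_trend.html")  # plotly Figure, per the write_html assertion
```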
utils.py
ADDED
@@ -0,0 +1,480 @@
```python
"""
Utility Module for Business Intelligence Dashboard

This module provides helper functions and utilities following SOLID principles.
Implements Single Responsibility Principle - each function has one clear purpose.

Author: Craig
Date: December 2024
"""

import pandas as pd
import numpy as np
from pathlib import Path
from typing import Union, Optional, List, Any
import logging
from abc import ABC, abstractmethod

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


# ============================================================================
# INTERFACE SEGREGATION PRINCIPLE (ISP)
# Define specific interfaces for different validation types
# ============================================================================

class DataValidator(ABC):
    """
    Abstract base class for data validation.
    Follows Interface Segregation Principle - clients depend only on methods they use.
    """

    @abstractmethod
    def validate(self, data: Any) -> bool:
        """
        Validate the given data.

        Args:
            data: Data to validate

        Returns:
            bool: True if validation passes, False otherwise
        """
        pass


class FileValidator(DataValidator):
    """
    Validates file existence and format.
    Follows Single Responsibility Principle - only handles file validation.
    """

    SUPPORTED_FORMATS = {'.csv', '.xlsx', '.xls', '.parquet', '.json', '.tsv'}

    def validate(self, file_path: Union[str, Path]) -> bool:
        """
        Validate if file exists and has supported format.

        Args:
            file_path: Path to the file

        Returns:
            bool: True if file is valid, False otherwise

        Raises:
            FileNotFoundError: If file doesn't exist
            ValueError: If file format is not supported
        """
        path = Path(file_path)

        if not path.exists():
            logger.error(f"File not found: {file_path}")
            raise FileNotFoundError(f"File not found: {file_path}")

        if path.suffix.lower() not in self.SUPPORTED_FORMATS:
            logger.error(f"Unsupported format: {path.suffix}")
            raise ValueError(
                f"Unsupported file format: {path.suffix}. "
                f"Supported formats: {', '.join(self.SUPPORTED_FORMATS)}"
            )

        logger.info(f"File validation passed: {file_path}")
        return True


class DataFrameValidator(DataValidator):
    """
    Validates pandas DataFrame properties.
    Follows Single Responsibility Principle - only handles DataFrame validation.
    """

    def validate(self, df: pd.DataFrame) -> bool:
        """
        Validate if DataFrame is valid and not empty.

        Args:
            df: DataFrame to validate

        Returns:
            bool: True if DataFrame is valid, False otherwise

        Raises:
            ValueError: If DataFrame is None or empty
        """
        if df is None:
            logger.error("DataFrame is None")
            raise ValueError("DataFrame cannot be None")

        if not isinstance(df, pd.DataFrame):
            logger.error(f"Expected DataFrame, got {type(df)}")
            raise ValueError(f"Expected pandas DataFrame, got {type(df)}")

        if df.empty:
            logger.error("DataFrame is empty")
            raise ValueError("DataFrame is empty")

        logger.info(f"DataFrame validation passed: {df.shape[0]} rows, {df.shape[1]} columns")
        return True


class ColumnValidator(DataValidator):
    """
    Validates column existence in DataFrame.
    Follows Single Responsibility Principle - only handles column validation.
    """

    def validate(self, df: pd.DataFrame, columns: Union[str, List[str]]) -> bool:
        """
        Validate if specified columns exist in DataFrame.

        Args:
            df: DataFrame to check
            columns: Column name(s) to validate

        Returns:
            bool: True if all columns exist, False otherwise

        Raises:
            ValueError: If any column doesn't exist
        """
        if isinstance(columns, str):
            columns = [columns]

        missing_columns = [col for col in columns if col not in df.columns]

        if missing_columns:
            logger.error(f"Missing columns: {missing_columns}")
            raise ValueError(
                f"Columns not found in DataFrame: {', '.join(missing_columns)}"
            )

        logger.info(f"Column validation passed: {columns}")
        return True


# ============================================================================
# UTILITY FUNCTIONS
# These follow Single Responsibility Principle
# ============================================================================

def format_number(number: Union[int, float], decimals: int = 2) -> str:
    """
    Format a number for display with thousand separators.

    Args:
        number: Number to format
        decimals: Number of decimal places

    Returns:
        str: Formatted number string

    Example:
        >>> format_number(1234567.89)
        '1,234,567.89'
    """
    try:
        if pd.isna(number):
            return "N/A"

        if isinstance(number, (int, np.integer)):
            return f"{number:,}"

        return f"{number:,.{decimals}f}"
    except (ValueError, TypeError) as e:
        logger.warning(f"Error formatting number {number}: {e}")
        return str(number)


def format_percentage(value: float, decimals: int = 2) -> str:
    """
    Format a value as percentage.

    Args:
        value: Value to format (0.5 = 50%)
        decimals: Number of decimal places

    Returns:
        str: Formatted percentage string

    Example:
        >>> format_percentage(0.456)
        '45.60%'
    """
    try:
        if pd.isna(value):
            return "N/A"
        return f"{value * 100:.{decimals}f}%"
    except (ValueError, TypeError) as e:
        logger.warning(f"Error formatting percentage {value}: {e}")
        return str(value)


def safe_divide(numerator: float, denominator: float, default: float = 0.0) -> float:
    """
    Safely divide two numbers, returning default if division by zero.

    Args:
        numerator: Numerator value
        denominator: Denominator value
        default: Default value to return if division fails

    Returns:
        float: Result of division or default value

    Example:
        >>> safe_divide(10, 2)
        5.0
        >>> safe_divide(10, 0, default=0)
        0.0
    """
    try:
        if denominator == 0 or pd.isna(denominator):
            return default
        return numerator / denominator
    except (ValueError, TypeError, ZeroDivisionError):
        return default


def get_column_types(df: pd.DataFrame) -> dict:
    """
    Categorize DataFrame columns by data type.

    Args:
        df: DataFrame to analyze

    Returns:
        dict: Dictionary with keys 'numerical', 'categorical', 'datetime'

    Example:
        >>> df = pd.DataFrame({'age': [25, 30], 'name': ['Alice', 'Bob']})
        >>> types = get_column_types(df)
        >>> types['numerical']
        ['age']
    """
    numerical = df.select_dtypes(include=[np.number]).columns.tolist()
    categorical = df.select_dtypes(include=['object', 'category']).columns.tolist()
    datetime = df.select_dtypes(include=['datetime64']).columns.tolist()

    return {
        'numerical': numerical,
        'categorical': categorical,
        'datetime': datetime
    }


def detect_date_columns(df: pd.DataFrame, sample_size: int = 100) -> List[str]:
    """
    Detect columns that might contain date strings.

    Args:
        df: DataFrame to analyze
        sample_size: Number of rows to sample for detection

    Returns:
        List[str]: List of potential date column names
    """
    potential_date_cols = []

    for col in df.select_dtypes(include=['object']).columns:
        sample = df[col].dropna().head(sample_size)

        if len(sample) == 0:
            continue

        # Try to parse as dates
        try:
            pd.to_datetime(sample, errors='coerce')
            # If more than 50% parse successfully, consider it a date column
            parsed = pd.to_datetime(sample, errors='coerce')
            if parsed.notna().sum() / len(sample) > 0.5:
                potential_date_cols.append(col)
        except Exception:
            continue

    return potential_date_cols


def clean_currency_column(series: pd.Series) -> pd.Series:
    """
    Clean currency columns by removing symbols and converting to float.

    Args:
        series: Pandas Series with currency values

    Returns:
        pd.Series: Cleaned numeric series

    Example:
        >>> s = pd.Series(['$1,234.56', '$789.00'])
        >>> clean_currency_column(s)
        0    1234.56
        1     789.00
        dtype: float64
    """
    try:
        # Remove currency symbols, commas, and spaces
        cleaned = series.astype(str).str.replace(r'[$,€£¥\s]', '', regex=True)
        return pd.to_numeric(cleaned, errors='coerce')
    except Exception as e:
        logger.warning(f"Error cleaning currency column: {e}")
        return series


def truncate_string(text: str, max_length: int = 50, suffix: str = "...") -> str:
    """
    Truncate a string to maximum length.

    Args:
        text: Text to truncate
        max_length: Maximum length
        suffix: Suffix to add if truncated

    Returns:
        str: Truncated string

    Example:
        >>> truncate_string("This is a very long text", 10)
        'This is...'
    """
    if not isinstance(text, str):
        text = str(text)

    if len(text) <= max_length:
        return text

    return text[:max_length - len(suffix)] + suffix


def get_memory_usage(df: pd.DataFrame) -> str:
    """
    Get human-readable memory usage of DataFrame.

    Args:
        df: DataFrame to analyze

    Returns:
        str: Memory usage string (e.g., "2.5 MB")
    """
    memory_bytes = df.memory_usage(deep=True).sum()

    for unit in ['B', 'KB', 'MB', 'GB']:
        if memory_bytes < 1024.0:
            return f"{memory_bytes:.2f} {unit}"
        memory_bytes /= 1024.0

    return f"{memory_bytes:.2f} TB"


# ============================================================================
# EXPORT UTILITIES
# These follow Single Responsibility Principle
# ============================================================================

class DataExporter(ABC):
    """
    Abstract base class for data export.
    Follows Open/Closed Principle - open for extension, closed for modification.
    """

    @abstractmethod
    def export(self, data: Any, filepath: Union[str, Path]) -> bool:
        """
        Export data to file.

        Args:
            data: Data to export
            filepath: Destination file path

        Returns:
            bool: True if export successful, False otherwise
        """
        pass


class CSVExporter(DataExporter):
    """
    Export DataFrame to CSV format.
    Follows Single Responsibility Principle.
    """

    def export(self, df: pd.DataFrame, filepath: Union[str, Path]) -> bool:
        """
        Export DataFrame to CSV file.

        Args:
            df: DataFrame to export
            filepath: Destination CSV file path

        Returns:
            bool: True if export successful, False otherwise
        """
        try:
            df.to_csv(filepath, index=False)
            logger.info(f"Successfully exported to CSV: {filepath}")
            return True
        except Exception as e:
            logger.error(f"Error exporting to CSV: {e}")
            return False


class ExcelExporter(DataExporter):
    """
    Export DataFrame to Excel format.
    Follows Single Responsibility Principle.
    """

    def export(self, df: pd.DataFrame, filepath: Union[str, Path]) -> bool:
        """
        Export DataFrame to Excel file.

        Args:
            df: DataFrame to export
            filepath: Destination Excel file path

        Returns:
            bool: True if export successful, False otherwise
        """
        try:
            df.to_excel(filepath, index=False, engine='openpyxl')
            logger.info(f"Successfully exported to Excel: {filepath}")
            return True
        except Exception as e:
            logger.error(f"Error exporting to Excel: {e}")
            return False


# ============================================================================
# CONSTANTS
# Centralized configuration following DRY principle
# ============================================================================

class Config:
    """
    Configuration constants for the application.
    Centralized configuration following Single Responsibility Principle.
    """

    # File formats
    SUPPORTED_FILE_FORMATS = {'.csv', '.xlsx', '.xls', '.parquet', '.json', '.tsv'}

    # Display settings
    MAX_DISPLAY_ROWS = 100
    MAX_STRING_LENGTH = 50
    DEFAULT_DECIMAL_PLACES = 2

    # Analysis settings
    CORRELATION_THRESHOLD = 0.7
    OUTLIER_ZSCORE_THRESHOLD = 3
    MIN_SAMPLE_SIZE = 30

    # Export settings
    DEFAULT_EXPORT_FORMAT = 'csv'
    EXPORT_TIMESTAMP_FORMAT = '%Y%m%d_%H%M%S'


if __name__ == "__main__":
    # Example usage and testing
    print("Utils module loaded successfully")
    print(f"Supported formats: {Config.SUPPORTED_FILE_FORMATS}")
visualizations.py
ADDED
|
@@ -0,0 +1,760 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Visualizations Module for Business Intelligence Dashboard
|
| 3 |
+
|
| 4 |
+
This module handles all data visualization operations using Strategy Pattern.
|
| 5 |
+
Supports multiple chart types with flexible rendering backends.
|
| 6 |
+
|
| 7 |
+
Author: Craig
|
| 8 |
+
Date: December 2024
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
import pandas as pd
|
| 12 |
+
import numpy as np
|
| 13 |
+
import matplotlib.pyplot as plt
|
| 14 |
+
import seaborn as sns
|
| 15 |
+
import plotly.express as px
|
| 16 |
+
import plotly.graph_objects as go
|
| 17 |
+
from typing import Union, List, Dict, Optional, Any, Tuple
|
| 18 |
+
from abc import ABC, abstractmethod
|
| 19 |
+
import logging
|
| 20 |
+
from pathlib import Path
|
| 21 |
+
|
| 22 |
+
from utils import ColumnValidator, DataFrameValidator, format_number, Config
|
| 23 |
+
|
| 24 |
+
# Configure logging
|
| 25 |
+
logging.basicConfig(level=logging.INFO)
|
| 26 |
+
logger = logging.getLogger(__name__)
|
| 27 |
+
|
| 28 |
+
# Set style for matplotlib
|
| 29 |
+
plt.style.use('seaborn-v0_8-darkgrid')
|
| 30 |
+
sns.set_palette("husl")
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
# ============================================================================
|
| 34 |
+
# STRATEGY PATTERN - Visualization Strategies
|
| 35 |
+
# Follows Open/Closed Principle and Strategy Pattern
|
| 36 |
+
# ============================================================================
|
| 37 |
+
|
| 38 |
+
class VisualizationStrategy(ABC):
|
| 39 |
+
"""
|
| 40 |
+
Abstract base class for visualization strategies.
|
| 41 |
+
Follows Strategy Pattern - allows different visualization algorithms.
|
| 42 |
+
"""
|
| 43 |
+
|
| 44 |
+
@abstractmethod
|
| 45 |
+
def create(self, df: pd.DataFrame, **kwargs) -> Any:
|
| 46 |
+
"""
|
| 47 |
+
Create visualization.
|
| 48 |
+
|
| 49 |
+
Args:
|
| 50 |
+
df: DataFrame to visualize
|
| 51 |
+
**kwargs: Additional parameters for visualization
|
| 52 |
+
|
| 53 |
+
Returns:
|
| 54 |
+
Visualization object (matplotlib Figure or plotly Figure)
|
| 55 |
+
"""
|
| 56 |
+
pass
|
| 57 |
+
|
| 58 |
+
@abstractmethod
|
| 59 |
+
def get_required_params(self) -> List[str]:
|
| 60 |
+
"""
|
| 61 |
+
Get list of required parameters for this visualization.
|
| 62 |
+
|
| 63 |
+
Returns:
|
| 64 |
+
List of required parameter names
|
| 65 |
+
"""
|
| 66 |
+
pass
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
# ============================================================================
|
| 70 |
+
# TIME SERIES VISUALIZATIONS
|
| 71 |
+
# ============================================================================
|
| 72 |
+
|
| 73 |
+
class TimeSeriesPlot(VisualizationStrategy):
|
| 74 |
+
"""
|
| 75 |
+
Create time series line plots.
|
| 76 |
+
Follows Single Responsibility Principle - only handles time series plots.
|
| 77 |
+
"""
|
| 78 |
+
|
| 79 |
+
def get_required_params(self) -> List[str]:
|
| 80 |
+
"""Required parameters for time series plot."""
|
| 81 |
+
return ['date_column', 'value_column']
|
| 82 |
+
|
| 83 |
+
def create(self, df: pd.DataFrame, date_column: str, value_column: str,
|
| 84 |
+
title: str = "Time Series Plot",
|
| 85 |
+
aggregation: str = 'sum',
|
| 86 |
+
backend: str = 'matplotlib',
|
| 87 |
+
**kwargs) -> Any:
|
| 88 |
+
"""
|
| 89 |
+
Create time series plot.
|
| 90 |
+
|
| 91 |
+
Args:
|
| 92 |
+
df: DataFrame with time series data
|
| 93 |
+
date_column: Column containing dates
|
| 94 |
+
value_column: Column containing values to plot
|
| 95 |
+
title: Plot title
|
| 96 |
+
aggregation: Aggregation method ('sum', 'mean', 'count', 'median')
|
| 97 |
+
backend: Visualization backend ('matplotlib' or 'plotly')
|
| 98 |
+
**kwargs: Additional plotting parameters
|
| 99 |
+
|
| 100 |
+
Returns:
|
| 101 |
+
matplotlib Figure or plotly Figure
|
| 102 |
+
"""
|
| 103 |
+
# Validate inputs
|
| 104 |
+
DataFrameValidator().validate(df)
|
| 105 |
+
ColumnValidator().validate(df, [date_column, value_column])
|
| 106 |
+
|
| 107 |
+
# Prepare data
|
| 108 |
+
df_plot = df.copy()
|
| 109 |
+
|
| 110 |
+
# Ensure date column is datetime
|
| 111 |
+
if not pd.api.types.is_datetime64_any_dtype(df_plot[date_column]):
|
| 112 |
+
df_plot[date_column] = pd.to_datetime(df_plot[date_column], errors='coerce')
|
| 113 |
+
|
| 114 |
+
# Remove rows with NaT dates
|
| 115 |
+
df_plot = df_plot.dropna(subset=[date_column])
|
| 116 |
+
|
| 117 |
+
# Sort by date
|
| 118 |
+
df_plot = df_plot.sort_values(date_column)
|
| 119 |
+
|
| 120 |
+
# Apply aggregation if needed
|
| 121 |
+
if aggregation != 'none':
|
| 122 |
+
df_plot = self._apply_aggregation(df_plot, date_column, value_column, aggregation)
|
| 123 |
+
|
| 124 |
+
# Create visualization based on backend
|
| 125 |
+
if backend == 'matplotlib':
|
| 126 |
+
return self._create_matplotlib(df_plot, date_column, value_column, title, aggregation)
|
| 127 |
+
elif backend == 'plotly':
|
| 128 |
+
return self._create_plotly(df_plot, date_column, value_column, title, aggregation)
|
| 129 |
+
else:
|
| 130 |
+
raise ValueError(f"Unsupported backend: {backend}")
|
| 131 |
+
|
| 132 |
+
def _apply_aggregation(self, df: pd.DataFrame, date_column: str,
|
| 133 |
+
value_column: str, aggregation: str) -> pd.DataFrame:
|
| 134 |
+
"""Apply aggregation to time series data."""
|
| 135 |
+
if aggregation == 'sum':
|
| 136 |
+
return df.groupby(date_column)[value_column].sum().reset_index()
|
| 137 |
+
elif aggregation == 'mean':
|
| 138 |
+
return df.groupby(date_column)[value_column].mean().reset_index()
|
| 139 |
+
elif aggregation == 'count':
|
| 140 |
+
return df.groupby(date_column)[value_column].count().reset_index()
|
| 141 |
+
elif aggregation == 'median':
|
| 142 |
+
return df.groupby(date_column)[value_column].median().reset_index()
|
| 143 |
+
else:
|
| 144 |
+
return df
|
| 145 |
+
|
| 146 |
+
def _create_matplotlib(self, df: pd.DataFrame, date_column: str,
|
| 147 |
+
value_column: str, title: str, aggregation: str):
|
| 148 |
+
"""Create matplotlib time series plot."""
|
| 149 |
+
fig, ax = plt.subplots(figsize=(12, 6))
|
| 150 |
+
|
| 151 |
+
ax.plot(df[date_column], df[value_column], marker='o', linewidth=2, markersize=4)
|
| 152 |
+
ax.set_xlabel(date_column, fontsize=12)
|
| 153 |
+
ax.set_ylabel(f"{value_column} ({aggregation})", fontsize=12)
|
| 154 |
+
ax.set_title(title, fontsize=14, fontweight='bold')
|
| 155 |
+
ax.grid(True, alpha=0.3)
|
| 156 |
+
|
| 157 |
+
# Rotate x-axis labels
|
| 158 |
+
plt.xticks(rotation=45, ha='right')
|
| 159 |
+
plt.tight_layout()
|
| 160 |
+
|
| 161 |
+
logger.info(f"Created matplotlib time series plot: {title}")
|
| 162 |
+
return fig
|
| 163 |
+
|
| 164 |
+
def _create_plotly(self, df: pd.DataFrame, date_column: str,
|
| 165 |
+
value_column: str, title: str, aggregation: str):
|
| 166 |
+
"""Create plotly time series plot."""
|
| 167 |
+
fig = px.line(df, x=date_column, y=value_column,
|
| 168 |
+
title=title,
|
| 169 |
+
labels={value_column: f"{value_column} ({aggregation})"})
|
| 170 |
+
|
| 171 |
+
fig.update_traces(mode='lines+markers')
|
| 172 |
+
fig.update_layout(
|
| 173 |
+
xaxis_title=date_column,
|
| 174 |
+
yaxis_title=f"{value_column} ({aggregation})",
|
| 175 |
+
hovermode='x unified',
|
| 176 |
+
template='plotly_white'
|
| 177 |
+
)
|
| 178 |
+
|
| 179 |
+
logger.info(f"Created plotly time series plot: {title}")
|
| 180 |
+
return fig
|
| 181 |
+
|
| 182 |
+
|
| 183 |
+
# ============================================================================
# DISTRIBUTION VISUALIZATIONS
# ============================================================================

class DistributionPlot(VisualizationStrategy):
    """
    Create distribution plots (histogram, box plot, violin plot).
    Follows Single Responsibility Principle - only handles distribution plots.
    """

    def get_required_params(self) -> List[str]:
        """Required parameters for distribution plot."""
        return ['column']

    def create(self, df: pd.DataFrame, column: str,
               plot_type: str = 'histogram',
               title: str = "Distribution Plot",
               bins: int = 30,
               backend: str = 'matplotlib',
               **kwargs) -> Any:
        """
        Create distribution plot.

        Args:
            df: DataFrame with data
            column: Column to visualize
            plot_type: Type of plot ('histogram', 'box', 'violin')
            title: Plot title
            bins: Number of bins for histogram
            backend: Visualization backend ('matplotlib' or 'plotly')
            **kwargs: Additional plotting parameters

        Returns:
            matplotlib Figure or plotly Figure
        """
        # Validate inputs
        DataFrameValidator().validate(df)
        ColumnValidator().validate(df, column)

        # Remove NaN values
        df_plot = df[column].dropna()

        if len(df_plot) == 0:
            raise ValueError(f"No valid data in column '{column}'")

        # Create visualization based on backend
        if backend == 'matplotlib':
            return self._create_matplotlib(df_plot, column, plot_type, title, bins)
        elif backend == 'plotly':
            return self._create_plotly(df_plot, column, plot_type, title, bins)
        else:
            raise ValueError(f"Unsupported backend: {backend}")

    def _create_matplotlib(self, data: pd.Series, column: str,
                           plot_type: str, title: str, bins: int):
        """Create matplotlib distribution plot."""
        fig, ax = plt.subplots(figsize=(10, 6))

        if plot_type == 'histogram':
            ax.hist(data, bins=bins, edgecolor='black', alpha=0.7)
            ax.set_ylabel('Frequency', fontsize=12)
        elif plot_type == 'box':
            ax.boxplot(data, vert=True)
            ax.set_ylabel(column, fontsize=12)
        elif plot_type == 'violin':
            # Use seaborn for violin plot
            sns.violinplot(y=data, ax=ax)
            ax.set_ylabel(column, fontsize=12)
        else:
            raise ValueError(f"Unsupported plot type: {plot_type}")

        ax.set_xlabel(column if plot_type == 'histogram' else '', fontsize=12)
        ax.set_title(title, fontsize=14, fontweight='bold')
        ax.grid(True, alpha=0.3, axis='y')

        plt.tight_layout()
        logger.info(f"Created matplotlib {plot_type} plot: {title}")
        return fig

    def _create_plotly(self, data: pd.Series, column: str,
                       plot_type: str, title: str, bins: int):
        """Create plotly distribution plot."""
        if plot_type == 'histogram':
            fig = px.histogram(data, x=data.values, nbins=bins, title=title,
                               labels={'x': column, 'y': 'Frequency'})
        elif plot_type == 'box':
            fig = px.box(y=data.values, title=title, labels={'y': column})
        elif plot_type == 'violin':
            fig = px.violin(y=data.values, title=title, labels={'y': column})
        else:
            raise ValueError(f"Unsupported plot type: {plot_type}")

        fig.update_layout(template='plotly_white')
        logger.info(f"Created plotly {plot_type} plot: {title}")
        return fig
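
# Usage sketch (illustrative; 'price' is a hypothetical column): comparing the
# three supported plot types on one numeric column.
#
#     plotter = DistributionPlot()
#     for kind in ('histogram', 'box', 'violin'):
#         fig = plotter.create(df, column='price', plot_type=kind,
#                              title=f"Price {kind}", backend='plotly')
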
# ============================================================================
# CATEGORY VISUALIZATIONS
# ============================================================================

class CategoryPlot(VisualizationStrategy):
    """
    Create category plots (bar chart, pie chart).
    Follows Single Responsibility Principle - only handles category plots.
    """

    def get_required_params(self) -> List[str]:
        """Required parameters for category plot."""
        return ['column']

    def create(self, df: pd.DataFrame, column: str,
               value_column: Optional[str] = None,
               plot_type: str = 'bar',
               title: str = "Category Analysis",
               aggregation: str = 'count',
               top_n: Optional[int] = None,
               backend: str = 'matplotlib',
               **kwargs) -> Any:
        """
        Create category plot.

        Args:
            df: DataFrame with data
            column: Categorical column to visualize
            value_column: Optional value column for aggregation
            plot_type: Type of plot ('bar' or 'pie')
            title: Plot title
            aggregation: Aggregation method ('count', 'sum', 'mean', 'median')
            top_n: Show only top N categories
            backend: Visualization backend ('matplotlib' or 'plotly')
            **kwargs: Additional plotting parameters

        Returns:
            matplotlib Figure or plotly Figure
        """
        # Validate inputs
        DataFrameValidator().validate(df)
        ColumnValidator().validate(df, column)

        if value_column:
            ColumnValidator().validate(df, value_column)

        # Prepare data
        if value_column and aggregation != 'count':
            # Aggregate by category
            if aggregation == 'sum':
                data = df.groupby(column)[value_column].sum()
            elif aggregation == 'mean':
                data = df.groupby(column)[value_column].mean()
            elif aggregation == 'median':
                data = df.groupby(column)[value_column].median()
            else:
                data = df[column].value_counts()
        else:
            # Simple count
            data = df[column].value_counts()

        # Get top N if specified
        if top_n:
            data = data.nlargest(top_n)

        # Sort for better visualization
        data = data.sort_values(ascending=False)

        # Create visualization based on backend
        if backend == 'matplotlib':
            return self._create_matplotlib(data, column, plot_type, title, aggregation)
        elif backend == 'plotly':
            return self._create_plotly(data, column, plot_type, title, aggregation)
        else:
            raise ValueError(f"Unsupported backend: {backend}")

    def _create_matplotlib(self, data: pd.Series, column: str,
                           plot_type: str, title: str, aggregation: str):
        """Create matplotlib category plot."""
        fig, ax = plt.subplots(figsize=(10, 6))

        if plot_type == 'bar':
            ax.bar(range(len(data)), data.values, edgecolor='black', alpha=0.7)
            ax.set_xticks(range(len(data)))
            ax.set_xticklabels(data.index, rotation=45, ha='right')
            ax.set_xlabel(column, fontsize=12)
            ax.set_ylabel(f'Value ({aggregation})', fontsize=12)

            # Add value labels on bars
            for i, (idx, val) in enumerate(data.items()):
                ax.text(i, val, format_number(val), ha='center', va='bottom')

        elif plot_type == 'pie':
            wedges, texts, autotexts = ax.pie(data.values, labels=data.index,
                                              autopct='%1.1f%%', startangle=90)
            # Make percentage text more readable
            for autotext in autotexts:
                autotext.set_color('white')
                autotext.set_fontweight('bold')
        else:
            raise ValueError(f"Unsupported plot type: {plot_type}")

        ax.set_title(title, fontsize=14, fontweight='bold')
        plt.tight_layout()

        logger.info(f"Created matplotlib {plot_type} plot: {title}")
        return fig

    def _create_plotly(self, data: pd.Series, column: str,
                       plot_type: str, title: str, aggregation: str):
        """Create plotly category plot."""
        if plot_type == 'bar':
            fig = px.bar(x=data.index, y=data.values, title=title,
                         labels={'x': column, 'y': f'Value ({aggregation})'})
            fig.update_traces(text=data.values, textposition='outside')
        elif plot_type == 'pie':
            fig = px.pie(values=data.values, names=data.index, title=title)
        else:
            raise ValueError(f"Unsupported plot type: {plot_type}")

        fig.update_layout(template='plotly_white')
        logger.info(f"Created plotly {plot_type} plot: {title}")
        return fig
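
# Usage sketch (illustrative; 'country' and 'revenue' are hypothetical
# columns): top-10 categories by total value as a bar chart.
#
#     fig = CategoryPlot().create(df, column='country', value_column='revenue',
#                                 plot_type='bar', aggregation='sum',
#                                 top_n=10, backend='matplotlib')
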
# ============================================================================
# RELATIONSHIP VISUALIZATIONS
# ============================================================================

class ScatterPlot(VisualizationStrategy):
    """
    Create scatter plots to show relationships between variables.
    Follows Single Responsibility Principle - only handles scatter plots.
    """

    def get_required_params(self) -> List[str]:
        """Required parameters for scatter plot."""
        return ['x_column', 'y_column']

    def create(self, df: pd.DataFrame, x_column: str, y_column: str,
               title: str = "Scatter Plot",
               color_column: Optional[str] = None,
               size_column: Optional[str] = None,
               show_trend: bool = False,
               backend: str = 'matplotlib',
               **kwargs) -> Any:
        """
        Create scatter plot.

        Args:
            df: DataFrame with data
            x_column: Column for x-axis
            y_column: Column for y-axis
            title: Plot title
            color_column: Optional column for color coding
            size_column: Optional column for point sizes
            show_trend: Whether to show trend line
            backend: Visualization backend ('matplotlib' or 'plotly')
            **kwargs: Additional plotting parameters

        Returns:
            matplotlib Figure or plotly Figure
        """
        # Validate inputs
        DataFrameValidator().validate(df)
        ColumnValidator().validate(df, [x_column, y_column])

        if color_column:
            ColumnValidator().validate(df, color_column)
        if size_column:
            ColumnValidator().validate(df, size_column)

        # Remove rows with NaN in required columns
        required_cols = [x_column, y_column]
        if color_column:
            required_cols.append(color_column)
        if size_column:
            required_cols.append(size_column)

        df_plot = df[required_cols].dropna()

        if len(df_plot) == 0:
            raise ValueError("No valid data after removing NaN values")

        # Create visualization based on backend
        if backend == 'matplotlib':
            return self._create_matplotlib(df_plot, x_column, y_column, title,
                                           color_column, size_column, show_trend)
        elif backend == 'plotly':
            return self._create_plotly(df_plot, x_column, y_column, title,
                                       color_column, size_column, show_trend)
        else:
            raise ValueError(f"Unsupported backend: {backend}")

    def _create_matplotlib(self, df: pd.DataFrame, x_column: str, y_column: str,
                           title: str, color_column: Optional[str],
                           size_column: Optional[str], show_trend: bool):
        """Create matplotlib scatter plot."""
        fig, ax = plt.subplots(figsize=(10, 6))

        # Prepare scatter parameters
        scatter_kwargs = {'alpha': 0.6, 'edgecolors': 'black', 'linewidth': 0.5}

        if size_column:
            scatter_kwargs['s'] = df[size_column]
        else:
            scatter_kwargs['s'] = 50

        if color_column:
            # Check if color column is categorical (string type)
            if df[color_column].dtype == 'object' or pd.api.types.is_categorical_dtype(df[color_column]):
                # Convert categorical to numerical codes for matplotlib
                categories = df[color_column].astype('category')
                color_codes = categories.cat.codes
                scatter = ax.scatter(df[x_column], df[y_column], c=color_codes,
                                     cmap='viridis', **scatter_kwargs)
                # Create custom legend; mirror scatter's min-max normalization
                # of the codes (code i maps to i / (n - 1), not i / n)
                n_cats = len(categories.cat.categories)
                handles = []
                for i, cat in enumerate(categories.cat.categories):
                    handles.append(plt.Line2D([0], [0], marker='o', color='w',
                                              markerfacecolor=plt.cm.viridis(i / max(n_cats - 1, 1)),
                                              markersize=8, label=cat))
                ax.legend(handles=handles, title=color_column)
            else:
                # Numerical color column
                scatter = ax.scatter(df[x_column], df[y_column], c=df[color_column],
                                     cmap='viridis', **scatter_kwargs)
                plt.colorbar(scatter, ax=ax, label=color_column)
        else:
            ax.scatter(df[x_column], df[y_column], **scatter_kwargs)

        # Add trend line if requested
        if show_trend:
            z = np.polyfit(df[x_column], df[y_column], 1)
            p = np.poly1d(z)
            ax.plot(df[x_column], p(df[x_column]), "r--", alpha=0.8, label='Trend')
            ax.legend()

        ax.set_xlabel(x_column, fontsize=12)
        ax.set_ylabel(y_column, fontsize=12)
        ax.set_title(title, fontsize=14, fontweight='bold')
        ax.grid(True, alpha=0.3)

        plt.tight_layout()
        logger.info(f"Created matplotlib scatter plot: {title}")
        return fig

    def _create_plotly(self, df: pd.DataFrame, x_column: str, y_column: str,
                       title: str, color_column: Optional[str],
                       size_column: Optional[str], show_trend: bool):
        """Create plotly scatter plot."""
        fig = px.scatter(df, x=x_column, y=y_column,
                         color=color_column, size=size_column,
                         title=title,
                         trendline='ols' if show_trend else None)

        fig.update_layout(template='plotly_white')
        logger.info(f"Created plotly scatter plot: {title}")
        return fig
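
# Usage sketch (illustrative, hypothetical column names). Note that
# show_trend=True with the plotly backend relies on px.scatter's OLS
# trendline, which needs statsmodels installed — an assumption about the
# runtime environment, not something this module enforces.
#
#     fig = ScatterPlot().create(df, x_column='price', y_column='rating',
#                                color_column='segment', show_trend=True,
#                                backend='plotly')
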
class CorrelationHeatmap(VisualizationStrategy):
    """
    Create correlation heatmap for numerical variables.
    Follows Single Responsibility Principle - only handles correlation heatmaps.
    """

    def get_required_params(self) -> List[str]:
        """Required parameters for correlation heatmap."""
        return []  # Uses all numerical columns by default

    def create(self, df: pd.DataFrame,
               columns: Optional[List[str]] = None,
               title: str = "Correlation Heatmap",
               method: str = 'pearson',
               backend: str = 'matplotlib',
               **kwargs) -> Any:
        """
        Create correlation heatmap.

        Args:
            df: DataFrame with data
            columns: Optional list of columns to include
            title: Plot title
            method: Correlation method ('pearson', 'spearman', 'kendall')
            backend: Visualization backend ('matplotlib' or 'plotly')
            **kwargs: Additional plotting parameters

        Returns:
            matplotlib Figure or plotly Figure
        """
        # Validate inputs
        DataFrameValidator().validate(df)

        # Select numerical columns
        if columns:
            ColumnValidator().validate(df, columns)
            df_corr = df[columns].select_dtypes(include=[np.number])
        else:
            df_corr = df.select_dtypes(include=[np.number])

        if df_corr.shape[1] < 2:
            raise ValueError("Need at least 2 numerical columns for correlation heatmap")

        # Calculate correlation
        corr_matrix = df_corr.corr(method=method)

        # Create visualization based on backend
        if backend == 'matplotlib':
            return self._create_matplotlib(corr_matrix, title)
        elif backend == 'plotly':
            return self._create_plotly(corr_matrix, title)
        else:
            raise ValueError(f"Unsupported backend: {backend}")

    def _create_matplotlib(self, corr_matrix: pd.DataFrame, title: str):
        """Create matplotlib correlation heatmap."""
        fig, ax = plt.subplots(figsize=(10, 8))

        # Create heatmap
        sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm',
                    center=0, square=True, linewidths=1,
                    cbar_kws={"shrink": 0.8}, ax=ax)

        ax.set_title(title, fontsize=14, fontweight='bold')
        plt.tight_layout()

        logger.info(f"Created matplotlib correlation heatmap: {title}")
        return fig

    def _create_plotly(self, corr_matrix: pd.DataFrame, title: str):
        """Create plotly correlation heatmap."""
        fig = px.imshow(corr_matrix,
                        text_auto='.2f',
                        color_continuous_scale='RdBu_r',
                        title=title,
                        aspect='auto')

        fig.update_layout(template='plotly_white')
        logger.info(f"Created plotly correlation heatmap: {title}")
        return fig
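
# Usage sketch (illustrative): Spearman correlations across all numeric
# columns; pass `columns=` to restrict the matrix to a subset.
#
#     fig = CorrelationHeatmap().create(df, method='spearman',
#                                       backend='matplotlib')
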
# ============================================================================
# VISUALIZATION MANAGER
# Uses Strategy Pattern to manage different visualization types
# ============================================================================

class VisualizationManager:
    """
    Manager class for visualizations using Strategy Pattern.
    Follows Open/Closed Principle - open for extension, closed for modification.
    """

    def __init__(self):
        """Initialize VisualizationManager with all available strategies."""
        self.strategies: Dict[str, VisualizationStrategy] = {
            'time_series': TimeSeriesPlot(),
            'distribution': DistributionPlot(),
            'category': CategoryPlot(),
            'scatter': ScatterPlot(),
            'correlation': CorrelationHeatmap()
        }

    def create_visualization(self, viz_type: str, df: pd.DataFrame, **kwargs) -> Any:
        """
        Create visualization using specified strategy.

        Args:
            viz_type: Type of visualization ('time_series', 'distribution', etc.)
            df: DataFrame to visualize
            **kwargs: Parameters specific to visualization type

        Returns:
            Visualization object

        Raises:
            ValueError: If visualization type is not supported
        """
        if viz_type not in self.strategies:
            raise ValueError(
                f"Unsupported visualization type: {viz_type}. "
                f"Available types: {list(self.strategies.keys())}"
            )

        strategy = self.strategies[viz_type]
        return strategy.create(df, **kwargs)

    def add_strategy(self, name: str, strategy: VisualizationStrategy) -> None:
        """
        Add new visualization strategy.
        Follows Open/Closed Principle - extend functionality without modifying existing code.

        Args:
            name: Name for the strategy
            strategy: Visualization strategy instance
        """
        self.strategies[name] = strategy
        logger.info(f"Added new visualization strategy: {name}")

    def get_available_visualizations(self) -> List[str]:
        """
        Get list of available visualization types.

        Returns:
            List of visualization type names
        """
        return list(self.strategies.keys())

    def get_required_params(self, viz_type: str) -> List[str]:
        """
        Get required parameters for a visualization type.

        Args:
            viz_type: Type of visualization

        Returns:
            List of required parameter names
        """
        if viz_type not in self.strategies:
            raise ValueError(f"Unsupported visualization type: {viz_type}")

        return self.strategies[viz_type].get_required_params()
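
# Extension sketch (illustrative, not part of the original commit): registering
# a new strategy without modifying the manager, per the Open/Closed note above.
# `HexbinPlot` and the column names are hypothetical.
#
#     class HexbinPlot(VisualizationStrategy):
#         def get_required_params(self):
#             return ['x_column', 'y_column']
#
#         def create(self, df, x_column, y_column, title="Hexbin", **kwargs):
#             fig, ax = plt.subplots(figsize=(10, 6))
#             ax.hexbin(df[x_column], df[y_column], gridsize=30, cmap='viridis')
#             ax.set_title(title)
#             return fig
#
#     manager = VisualizationManager()
#     manager.add_strategy('hexbin', HexbinPlot())
#     fig = manager.create_visualization('hexbin', df,
#                                        x_column='price', y_column='rating')
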
# ============================================================================
# UTILITY FUNCTIONS FOR SAVING VISUALIZATIONS
# ============================================================================

def save_visualization(fig: Any, filepath: Union[str, Path],
                       dpi: int = 300, format: str = 'png') -> bool:
    """
    Save visualization to file.

    Args:
        fig: Matplotlib or Plotly figure
        filepath: Path to save file
        dpi: DPI for raster formats
        format: File format ('png', 'jpg', 'pdf', 'svg', 'html')

    Returns:
        bool: True if saved successfully
    """
    try:
        filepath = Path(filepath)

        # Handle matplotlib figures
        if hasattr(fig, 'savefig'):
            fig.savefig(filepath, dpi=dpi, bbox_inches='tight', format=format)
            logger.info(f"Saved matplotlib figure to {filepath}")

        # Handle plotly figures
        elif hasattr(fig, 'write_image') or hasattr(fig, 'write_html'):
            if format in ['png', 'jpg', 'pdf', 'svg']:
                fig.write_image(filepath, format=format)
            elif format == 'html':
                # Fail loudly instead of logging success without writing a file
                raise ValueError(f"Unsupported format for plotly figure: {format}")
            logger.info(f"Saved plotly figure to {filepath}")

        else:
            raise ValueError("Unknown figure type")

        return True

    except Exception as e:
        logger.error(f"Error saving visualization: {e}")
        return False
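
# Usage sketch (illustrative, hypothetical path and columns). Static export of
# plotly figures via write_image additionally needs an image engine such as
# kaleido available at runtime — an environment assumption, not something this
# module installs.
#
#     fig = VisualizationManager().create_visualization(
#         'time_series', df, date_column='order_date', value_column='revenue')
#     save_visualization(fig, 'outputs/revenue_trend.png', dpi=150)
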
if __name__ == "__main__":
    # Example usage
    print("Visualizations module loaded successfully")

    # Demonstrate available visualizations
    manager = VisualizationManager()
    print(f"Available visualizations: {manager.get_available_visualizations()}")