| | import pandas as pd |
| | import numpy as np |
| | from transformers import GPT2Tokenizer, GPT2Model |
| | from sklearn.preprocessing import MultiLabelBinarizer |
| | from torch import nn |
| | import torch |
| | import openai |
| | from collections import Counter |
| | import nltk |
| | from nltk.corpus import stopwords |
| | from nltk.tokenize import word_tokenize |
| |
|
class GenreClassifier(nn.Module):
    """Multi-label genre scorer built on a pretrained GPT-2 encoder.

    The encoder's hidden states are averaged over the sequence, passed
    through dropout and a single linear layer, and squashed with a sigmoid so
    that every output is an independent per-genre score in ``[0, 1]``.
    """

    def __init__(self, num_genres=20):
        """Load the ``'gpt2'`` checkpoint and attach a fresh linear head.

        Args:
            num_genres: Number of scores produced per input sequence.
        """
        super().__init__()
        self.gpt2 = GPT2Model.from_pretrained('gpt2')
        self.dropout = nn.Dropout(0.1)
        # Take the width from the loaded config rather than hard-coding 768,
        # so the head always matches the encoder that was actually loaded.
        self.genre_classifier = nn.Linear(self.gpt2.config.n_embd, num_genres)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        """Return per-genre scores for a batch of token sequences.

        Args:
            input_ids: Token ids, one row per sequence.
            attention_mask: Same layout as ``input_ids``; non-zero marks a
                real token, zero marks padding. ``None`` means "no padding".

        Returns:
            Tensor with one row per sequence and one sigmoid score per genre.
        """
        outputs = self.gpt2(input_ids=input_ids, attention_mask=attention_mask)
        hidden_states = outputs[0]

        if attention_mask is None:
            pooled_output = hidden_states.mean(dim=1)
        else:
            # Average over real tokens only. A plain mean would also average
            # in the hidden states of padding positions, which swamp short
            # inputs that were padded out to a long fixed length.
            mask = attention_mask.unsqueeze(-1).to(hidden_states.dtype)
            token_sum = (hidden_states * mask).sum(dim=1)
            # clamp guards against division by zero for an all-padding row.
            token_count = mask.sum(dim=1).clamp(min=1.0)
            pooled_output = token_sum / token_count

        pooled_output = self.dropout(pooled_output)
        genre_logits = self.genre_classifier(pooled_output)
        return self.sigmoid(genre_logits)
| |
|
class BookGenreAnalyzer:
    """Rank a fixed list of genres for a piece of book text.

    Scores come from a local ``GenreClassifier``; genres that an OpenAI
    completion model also names get their score boosted. The OpenAI calls go
    through the module-level ``openai.Completion`` / ``openai.File`` /
    ``openai.FineTune`` interface.

    NOTE(review): ``GenreClassifier()`` is constructed here without loading
    any trained head weights, so the local scores are only meaningful once
    the classifier has been trained elsewhere -- confirm.
    NOTE(review): the default model id ``"gpt-3"`` does not look like a
    published OpenAI model name; pass real ids via the constructor.
    """

    # Number of leading characters of the book text sent to OpenAI.
    PROMPT_CHARS = 1000
    # Multiplier applied to a local score when OpenAI names the same genre.
    GPT3_BOOST = 1.2

    def __init__(self, api_key, completion_model="gpt-3",
                 fine_tune_model="gpt-3"):
        """Set up the tokenizer, classifier, genre list and stop words.

        Args:
            api_key: OpenAI API key; stored on the ``openai`` module.
            completion_model: Model id used by ``analyze_book``.
            fine_tune_model: Base model id used by ``fine_tune_with_gpt3``.
        """
        # The key is a module-level setting of ``openai``; this object has no
        # ``openai`` attribute to assign through.
        openai.api_key = api_key
        self.completion_model = completion_model
        self.fine_tune_model = fine_tune_model

        self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        # GPT-2's tokenizer ships without a padding token, and
        # preprocess_text pads to a fixed length, so reuse end-of-text.
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        self.model = GenreClassifier()
        # Inference only: switch dropout off so repeated calls agree.
        self.model.eval()

        self.genre_labels = self._load_genre_labels()
        nltk.download('punkt')
        nltk.download('stopwords')
        self.stop_words = set(stopwords.words('english'))

    def _load_genre_labels(self):
        """Return the predefined genre names, in classifier output order."""
        return [
            "Fiction", "Non-fiction", "Mystery", "Romance", "Science Fiction",
            "Fantasy", "Thriller", "Horror", "Historical Fiction", "Biography",
            "Self-help", "Business", "Science", "Philosophy", "Poetry",
            "Drama", "Adventure", "Literary Fiction", "Young Adult", "Children's",
        ]

    def _build_prompt(self, book_text):
        """Return the prompt shared by fine-tuning examples and inference."""
        return f"Book text: {book_text[:self.PROMPT_CHARS]}...\nGenres:"

    def preprocess_text(self, text):
        """Lower-case, drop stop words and tokenize ``text`` for GPT-2.

        Args:
            text: Raw book text.

        Returns:
            The tokenizer's encoding (``input_ids`` and ``attention_mask``
            as PyTorch tensors), truncated/padded to 1024 tokens.
        """
        tokens = word_tokenize(text.lower())
        tokens = [t for t in tokens if t not in self.stop_words]

        return self.tokenizer(
            ' '.join(tokens),
            truncation=True,
            max_length=1024,
            padding='max_length',
            return_tensors='pt',
        )

    def extract_features(self, text):
        """Run the classifier on ``text`` without tracking gradients.

        Args:
            text: Raw book text.

        Returns:
            The classifier output: one row of per-genre scores for the text.
        """
        encodings = self.preprocess_text(text)
        with torch.no_grad():
            return self.model(
                input_ids=encodings['input_ids'],
                attention_mask=encodings['attention_mask'],
            )

    def fine_tune_with_gpt3(self, training_data):
        """Start an OpenAI fine-tuning job from labelled books.

        Args:
            training_data: Iterable of ``(book_text, genres)`` pairs, where
                ``genres`` is an iterable of genre names.

        Returns:
            The response of ``openai.FineTune.create``, or ``None`` if the
            upload or the job creation raised (the error is printed).
        """
        formatted_data = [
            {
                "prompt": self._build_prompt(book_text),
                "completion": f" {', '.join(genres)}",
            }
            for book_text, genres in training_data
        ]

        # Best effort by design: report the failure and let the caller
        # decide, rather than propagating upload/API errors.
        try:
            return openai.FineTune.create(
                training_file=self._upload_training_data(formatted_data),
                model=self.fine_tune_model,
                n_epochs=3,
                batch_size=4,
                learning_rate_multiplier=0.1,
            )
        except Exception as e:
            print(f"Fine-tuning error: {e}")
            return None

    def _upload_training_data(self, formatted_data):
        """Write ``formatted_data`` as JSON Lines and upload it to OpenAI.

        The file is written to ``training_data.jsonl`` in the current
        working directory and is left there afterwards.

        Args:
            formatted_data: List of ``{"prompt": ..., "completion": ...}``.

        Returns:
            The id of the uploaded file.
        """
        import json

        with open('training_data.jsonl', 'w', encoding='utf-8') as f:
            f.writelines(json.dumps(entry) + '\n' for entry in formatted_data)

        with open('training_data.jsonl', 'rb') as f:
            response = openai.File.create(file=f, purpose='fine-tune')
        return response.id

    def _query_gpt3_genres(self, book_text):
        """Ask the completion model for genres; return lower-cased names.

        Returns an empty set when the request fails, so a missing or broken
        OpenAI setup degrades to "local scores only" instead of crashing.
        """
        try:
            response = openai.Completion.create(
                model=self.completion_model,
                prompt=self._build_prompt(book_text),
                max_tokens=100,
                temperature=0.3,
            )
            raw = response.choices[0].text
        except Exception as e:
            print(f"GPT-3 genre lookup error: {e}")
            return set()
        # Normalise case and whitespace so "mystery" or "Mystery " still
        # matches the label "Mystery".
        return {name.strip().lower() for name in raw.split(',') if name.strip()}

    def analyze_book(self, book_text, top_k=20):
        """Return the highest-scoring genres for ``book_text``.

        Args:
            book_text: Raw book text.
            top_k: Maximum number of ``(genre, score)`` pairs to return.

        Returns:
            List of ``(genre, score)`` tuples sorted by descending score.
            Scores of genres also named by the completion model are
            multiplied by ``GPT3_BOOST`` and capped at 1.0.
        """
        predictions = self.extract_features(book_text)[0].tolist()
        gpt3_genres = self._query_gpt3_genres(book_text)

        genres_with_scores = []
        for genre, score in zip(self.genre_labels, predictions):
            score = float(score)
            if genre.lower() in gpt3_genres:
                # Build the boosted value into the result; rebinding a loop
                # variable alone would leave the stored score untouched.
                score = min(score * self.GPT3_BOOST, 1.0)
            genres_with_scores.append((genre, score))

        genres_with_scores.sort(key=lambda item: item[1], reverse=True)
        return genres_with_scores[:top_k]
| |
|
| | |
def main():
    """Demo: rank genres for a placeholder text, then start a fine-tune job."""
    analyzer = BookGenreAnalyzer('your-api-key')

    # Placeholder input; replace with the text of the book to classify.
    sample_text = """
    [Your book text here]
    """

    ranked = analyzer.analyze_book(sample_text)
    print("\nTop 20 Genres:")
    for label, score in ranked:
        print(f"{label}: {score:.2%}")

    # Tiny illustrative training set of (text, genres) pairs.
    examples = [
        ("Book 1 text...", ["Mystery", "Thriller"]),
        ("Book 2 text...", ["Science Fiction", "Adventure"]),
    ]

    job = analyzer.fine_tune_with_gpt3(examples)
    if not job:
        return
    print("\nFine-tuning job created successfully!")
| |
|
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()