{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "305b0a04", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "from pprint import pprint\n", "from nltk.corpus import stopwords\n", "from gensim.utils import simple_preprocess\n", "from gensim.models import Phrases, LdaModel\n", "from gensim.models.phrases import Phraser\n", "from gensim import corpora\n", "from gensim.models.coherencemodel import CoherenceModel" ] }, { "cell_type": "code", "execution_count": 2, "id": "b44f0e77", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[nltk_data] Downloading package stopwords to\n", "[nltk_data] C:\\Users\\vldth\\AppData\\Roaming\\nltk_data...\n", "[nltk_data] Package stopwords is already up-to-date!\n" ] }, { "data": { "text/plain": [ "True" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Make sure the NLTK stopwords corpus is available locally\n", "import nltk\n", "nltk.download('stopwords')" ] }, { "cell_type": "code", "execution_count": 3, "id": "20eb23e6", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Original dataset:\n", " review sentiment\n", "0 at first gumagana cya..pero pagnalowbat cya nd... 1\n", "1 grabi pangalawa ko ng order sa shapee pero pur... 1\n", "2 2l gray/black order ko. bakit 850ml lang po pi... 1\n", "3 walang silbing product.. bwesit. di gumagana d... 1\n", "4 d po maganda naman po yung neck fan, pero po n... 4\n" ] } ], "source": [ "# Read the review CSV and preview its first rows\n", "df = pd.read_csv(filepath_or_buffer='SentiTaglish_ProductsAndServices.csv')\n", "print(\"Original dataset:\", df.head(), sep=\"\\n\")" ] }, { "cell_type": "code", "execution_count": 4, "id": "04ac2df1", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " review\n", "0 at first gumagana cya..pero pagnalowbat cya nd...\n", "1 grabi pangalawa ko ng order sa shapee pero pur...\n", "2 2l gray/black order ko. bakit 850ml lang po pi...\n", "3 walang silbing product.. bwesit. 
di gumagana d...\n", "4 d po maganda naman po yung neck fan, pero po n...\n" ] } ], "source": [ "# Drop the sentiment column\n", "reviews_df = df.drop(columns=['sentiment'])\n", "print(reviews_df.head())" ] }, { "cell_type": "code", "execution_count": 5, "id": "4ec261ac", "metadata": {}, "outputs": [], "source": [ "# Drop missing reviews first: astype(str) would otherwise turn NaN into the literal text 'nan'\n", "documents = reviews_df['review'].dropna().astype(str).tolist()" ] }, { "cell_type": "code", "execution_count": 6, "id": "001e98d7", "metadata": {}, "outputs": [], "source": [ "def load_stopwords(filepath):\n", "    \"\"\"Load stopwords from a UTF-8 text file that lists one word per line.\n", "\n", "    Blank lines are skipped and surrounding whitespace is removed. Words are\n", "    lowercased because simple_preprocess emits lowercase tokens, so an\n", "    uppercase entry could never match one.\n", "\n", "    Args:\n", "        filepath: Path of the stopword file.\n", "\n", "    Returns:\n", "        set[str]: The distinct, lowercased stopwords found in the file.\n", "    \"\"\"\n", "    # 'utf-8-sig' also reads plain UTF-8, but strips a leading byte-order mark\n", "    # that would otherwise stay attached to the first word.\n", "    with open(filepath, 'r', encoding='utf-8-sig') as file:\n", "        words = (line.strip().lower() for line in file)\n", "        return {word for word in words if word}" ] }, { "cell_type": "code", "execution_count": 7, "id": "01e67728", "metadata": {}, "outputs": [], "source": [ "# Define stopwords\n", "english_stopwords = stopwords.words('english')\n", "\n", "# Tagalog/Filipino stopwords\n", "tagalog_stopwords = load_stopwords(\"stopwords-new.txt\")\n", "\n", "# Single lookup set used to filter tokens in preprocess_data\n", "combined_stopwords = set(english_stopwords).union(tagalog_stopwords)" ] }, { "cell_type": "code", "execution_count": 8, "id": "215e29fa", "metadata": {}, "outputs": [], "source": [ "def preprocess_data(documents, stop_words=None):\n", "    \"\"\"Tokenize each document and remove stopwords.\n", "\n", "    Args:\n", "        documents: Iterable of raw texts; each item is passed through str()\n", "            before tokenizing.\n", "        stop_words: Optional collection of tokens to drop. Defaults to the\n", "            notebook-level combined_stopwords set.\n", "\n", "    Returns:\n", "        list[list[str]]: One list of kept tokens per input document, in\n", "        input order.\n", "    \"\"\"\n", "    if stop_words is None:\n", "        stop_words = combined_stopwords\n", "    return [\n", "        [word for word in simple_preprocess(str(doc)) if word not in stop_words]\n", "        for doc in documents\n", "    ]" ] }, { "cell_type": "code", "execution_count": 9, "id": "b3c0885c", "metadata": {}, "outputs": [], "source": [ "# 1. Preprocess and tokenize your documents\n", "processed_texts = preprocess_data(documents) # Should return list of tokenized docs\n", "\n", "# 2. Create bigram and trigram models\n", "bigram = Phrases(processed_texts, min_count=3, threshold=5)\n", "trigram = Phrases(bigram[processed_texts], threshold=5)\n", "\n", "# 3. Convert to efficient Phrasers\n", "bigram_mod = Phraser(bigram)\n", "trigram_mod = Phraser(trigram)\n", "\n", "# 4. 
Apply phrase models\n", "def make_ngrams(texts):\n", "    \"\"\"Merge detected phrases in every tokenized document.\n", "\n", "    Each document goes through bigram_mod first and the result is then\n", "    passed through trigram_mod.\n", "\n", "    Args:\n", "        texts: Iterable of token lists.\n", "\n", "    Returns:\n", "        list: One phrase-merged token sequence per input document.\n", "    \"\"\"\n", "    return [trigram_mod[bigram_mod[tokens]] for tokens in texts]\n", "\n", "ngrammed_texts = make_ngrams(processed_texts)\n", "\n", "# 5. Rebuild one space-separated string per document\n", "texts_for_bertopic = [' '.join(tokens) for tokens in ngrammed_texts]" ] }, { "cell_type": "code", "execution_count": 10, "id": "f6f4f0b1", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "BERTopic is working!\n" ] } ], "source": [ "# Import check: the message below only prints if bertopic imported successfully\n", "from bertopic import BERTopic\n", "print(\"BERTopic is working!\")" ] }, { "cell_type": "code", "execution_count": 11, "id": "9ac0a678", "metadata": {}, "outputs": [], "source": [ "# Build a BERTopic model configured for multilingual text\n", "topic_model = BERTopic(language=\"multilingual\")\n", "\n", "# Fit on the phrase-merged documents; returns the topics and probabilities\n", "topics, probs = topic_model.fit_transform(texts_for_bertopic)" ] }, { "cell_type": "code", "execution_count": 12, "id": "f0856aac", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
| \n", " | Topic | \n", "Count | \n", "Name | \n", "Representation | \n", "Representative_Docs | \n", "
|---|---|---|---|---|---|
| 0 | \n", "-1 | \n", "3614 | \n", "-1_item_seller_product_kasi | \n", "[item, seller, product, kasi, order, maganda, ... | \n", "[maganda mura thank order next_time, ganda ite... | \n", "
| 1 | \n", "0 | \n", "993 | \n", "0_color_black_white_pink | \n", "[color, black, white, pink, blue, green, wrong... | \n", "[disappointed wrong_color purple pink, pink_bl... | \n", "
| 2 | \n", "1 | \n", "514 | \n", "1_size_maliit_size_sizes_add_size | \n", "[size, maliit_size, sizes, add_size, maliit, l... | \n", "[wrong_size binigay order size size binigay, c... | \n", "
| 3 | \n", "2 | \n", "463 | \n", "2_dumating_sira_lng_ok | \n", "[dumating, sira, lng, ok, agad, sana, maganda,... | \n", "[sobrang nakakadismaya basta maka deliver lng ... | \n", "
| 4 | \n", "3 | \n", "336 | \n", "3_price_worth_worth_price_good_price | \n", "[price, worth, worth_price, good_price, sakto_... | \n", "[maganda worth_price, ganda worth, okay lng pr... | \n", "
| 5 | \n", "4 | \n", "194 | \n", "4_food_place_masarap_service | \n", "[food, place, masarap, service, staff, rice, c... | \n", "[service good energy crew poor took mins food ... | \n", "
| 6 | \n", "5 | \n", "181 | \n", "5_order_shop_thank_seller_order_received | \n", "[order, shop, thank_seller, order_received, or... | \n", "[goods order_received maganda gumagana thank_s... | \n", "
| 7 | \n", "6 | \n", "175 | \n", "6_damage_item_damaged_items | \n", "[damage, item, damaged, items, box, product, t... | \n", "[ganda damage thankyou_seller, damage dumating... | \n", "
| 8 | \n", "7 | \n", "158 | \n", "7_good_quality_good_maganda_quality_quality | \n", "[good_quality, good, maganda_quality, quality,... | \n", "[good_quality ganda, good_quality, maganda goo... | \n", "
| 9 | \n", "8 | \n", "156 | \n", "8_shoes_sandals_socks_boots | \n", "[shoes, sandals, socks, boots, slippers, legs,... | \n", "[maganda shoes price see_get, maganda shoes se... | \n", "
| 10 | \n", "9 | \n", "141 | \n", "9_rider_ganda_ganda_kuya_rider_thank_seller | \n", "[rider, ganda_ganda, kuya_rider, thank_seller,... | \n", "[sakto thanks_seller rider well_packaged, than... | \n", "
| 11 | \n", "10 | \n", "132 | \n", "10_bubble_wrap_naka_bubble_wrap_box_fragile | \n", "[bubble_wrap, naka_bubble_wrap, box, fragile, ... | \n", "[requested bubble_wrap nasa plastic buti nasir... | \n", "
| 12 | \n", "11 | \n", "129 | \n", "11_order_order_ulit_nd_order_order_received | \n", "[order, order_ulit, nd_order, order_received, ... | \n", "[poor nasunod order st try mali agad wag_umord... | \n", "
| 13 | \n", "12 | \n", "108 | \n", "12_refund_return_sayang_pera_return_refund | \n", "[refund, return, sayang_pera, return_refund, p... | \n", "[good dumating sira dami process refund, nakak... | \n", "
| 14 | \n", "13 | \n", "106 | \n", "13_battery_rechargeable_battery_life_gumagana | \n", "[battery, rechargeable, battery_life, gumagana... | \n", "[kwenta gumagana bumili bagong battery everead... | \n", "
| 15 | \n", "14 | \n", "96 | \n", "14_super_ganda_love_super_love_super | \n", "[super_ganda, love, super_love, super, love_th... | \n", "[see_get super_ganda talaga, legit super_ganda... | \n", "
| 16 | \n", "15 | \n", "94 | \n", "15_scent_perfume_smell_amoy | \n", "[scent, perfume, smell, amoy, cloud, coconut, ... | \n", "[sure anong smell perfume toh disappointed, wa... | \n", "
| 17 | \n", "16 | \n", "93 | \n", "16_god_bless_godbless_maraming_salamat_merry_c... | \n", "[god_bless, godbless, maraming_salamat, merry_... | \n", "[good maraming_salamat sayo seller ganda item ... | \n", "
| 18 | \n", "17 | \n", "91 | \n", "17_sound_sounds_mic_sound_quality | \n", "[sound, sounds, mic, sound_quality, basag, aud... | \n", "[basag sounds case, basag sound mic sira paran... | \n", "
| 19 | \n", "18 | \n", "85 | \n", "18_mascara_lipstick_lashes_eyeliner | \n", "[mascara, lipstick, lashes, eyeliner, lips, dr... | \n", "[mascara nice tho hold curl lashes curl lashes... | \n", "