| | import sys |
| | import os |
| | from datetime import datetime |
| | import pandas as pd |
| | import contexttimer |
| | from urllib.request import urlopen |
| | import requests |
| | from PIL import Image |
| | import torch |
| | from torchvision.transforms import functional as TF |
| | from multiprocessing import Pool |
| | from tqdm import tqdm |
| | import logging |
| | import sys |
| | import numpy as np |
| |
|
| |
|
| |
|
from nltk.tag import CRFTagger

# Fail fast on a missing argument instead of raising IndexError further down,
# after the tagger model has been loaded and download.log has been truncated.
# Extra arguments are tolerated so existing invocations keep working.
#   argv[1]: input TSV containing at least "caption" and "url" columns
#   argv[2]: path of the filtered TSV written at the end of the script
if len(sys.argv) < 3:
    sys.exit(f"Usage: python {sys.argv[0]} <input.tsv> <output.tsv>")

# Module-level CRF part-of-speech tagger used by drop_no() below.
ct = CRFTagger()
ct.set_model_file('all_indo_man_tag_corpus_model.crf.tagger')

# HTTP request headers; not referenced in the visible part of this script.
headers = {
    "User-Agent": "Googlebot-Image/1.0",
    "X-Forwarded-For": "64.18.15.200",
}

# filemode='w' starts download.log afresh on every run.
logging.basicConfig(filename='download.log', filemode='w', level=logging.INFO)
requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)

print(f'Starting to load at {datetime.now().isoformat(timespec="minutes")}')

# Load the input table and keep only the two columns the filter needs.
with contexttimer.Timer(prefix="Loading from tsv"):
    df = pd.read_csv(sys.argv[1], delimiter='\t')
    df = df[["caption", "url"]]
| |
|
def drop_no(text, *, max_len=96, nnp_ratio=0.8, tagger=None):
    """Return True when a caption should be dropped from the dataset.

    A caption is dropped when any of the following holds:

    * it is not a string (e.g. the NaN pandas yields for an empty cell);
    * it is empty, contains only whitespace, or is longer than ``max_len``
      characters;
    * at least ``nnp_ratio`` of its whitespace-separated tokens are tagged
      ``"NNP"`` by the tagger;
    * the tagger raises (best effort: the error is printed and the caption
      is dropped rather than aborting the whole run).

    Args:
        text: The caption to inspect.
        max_len: Maximum caption length, in characters, that is kept.
        nnp_ratio: Fraction of ``"NNP"`` tokens at or above which the
            caption is dropped.
        tagger: Object exposing ``tag_sents(list_of_token_lists)``; defaults
            to the module-level ``ct`` tagger.

    Returns:
        ``True`` if the caption should be dropped, ``False`` otherwise.
    """
    # Explicit guards replace the former reliance on a catch-all ``except``
    # (TypeError for NaN, ZeroDivisionError for whitespace-only captions).
    if not isinstance(text, str):
        return True
    if not text or len(text) > max_len:
        return True

    tokens = text.split()
    if not tokens:
        return True

    if tagger is None:
        tagger = ct

    # Only the tagger call is guarded; everything else is plain computation.
    try:
        tagged = tagger.tag_sents([tokens])[0]
    except Exception as e:
        print(e)
        return True

    if not tagged:
        return True

    nnp_cnt = sum(1 for pair in tagged if pair[1] == "NNP")
    return (nnp_cnt / len(tagged)) >= nnp_ratio
| | |
# Flag every caption with drop_no(), keep only the rows flagged False, and
# discard the temporary flag column again.
df["to_drop"] = df["caption"].apply(drop_no)
df = df.loc[df["to_drop"].eq(False)].drop(columns=["to_drop"])

# Record each surviving row's index label from the loaded table as a column,
# since the index itself is not written out below.
df["index_row"] = df.index

# Write the filtered table as TSV to the path given as the second argument.
df.to_csv(sys.argv[2], sep="\t", index=False)
| |
|
| |
|