import os

from datasets.download.download_config import DownloadConfig
from datasets.utils.file_utils import cached_path
from datasets.utils.hub import hf_hub_url

def get_readme_path(dataset_name):
    # Resolve the URL of the dataset's README.md on the Hugging Face Hub,
    # download it into the local cache, and return the local file path.
    readme_path = hf_hub_url(dataset_name, "README.md")
    return cached_path(readme_path, download_config=DownloadConfig())

def update_readme(dataset_name, subreddit, date_to_fetch):
    # Fetch the current README from the Hub, then regenerate the
    # auto-generated section with the latest coverage dates.
    path = get_readme_path(dataset_name=dataset_name)
    readme_text = f"""
# Dataset Name
{dataset_name}

## Update Frequency
The dataset is updated daily and covers the period from `{os.environ["START_DATE"]}` to {date_to_fetch}.

## Dataset Overview
The goal is to have an open dataset of `{subreddit}` submissions. The data is sourced from the Pushshift API.

## Data Collection
The data is collected with sequential calls that follow the pagination of the Pushshift API.

## Attribution
Data sourced from the Pushshift API.
"""
    append_readme(path=path, readme_text=readme_text)
    return readme_text

def append_readme(path, readme_text):
    # Everything below the marker is regenerated on each update;
    # any hand-written content above it is preserved.
    generated_below_marker = "--- Generated Below ---"
    with open(path, "r") as file:
        content = file.read()

    if generated_below_marker in content:
        # Marker already present: replace everything after it.
        index = content.index(generated_below_marker) + len(generated_below_marker)
        content = content[:index] + "\n\n" + readme_text
    else:
        # First run: append the marker, then the generated section.
        content += "\n\n" + generated_below_marker + "\n\n" + readme_text + "\n"

    with open(path, "w") as file:
        file.write(content)
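
# A minimal usage sketch, not part of the original script: the repo id,
# subreddit, and dates below are hypothetical placeholders, and START_DATE
# is assumed to be set in the environment before the update runs.
if __name__ == "__main__":
    os.environ.setdefault("START_DATE", "2013-01-01")  # hypothetical start date
    update_readme(
        dataset_name="user/reddit-submissions",  # hypothetical Hub repo id
        subreddit="askreddit",  # hypothetical subreddit
        date_to_fetch="2023-01-01",  # hypothetical end of the covered period
    )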