derek-thomas
committed on
Commit
·
1d46c26
1
Parent(s):
1b56724
Initializing cols
Browse files- utilities/pushshift_data.py +12 -0
utilities/pushshift_data.py
CHANGED
|
@@ -146,9 +146,21 @@ def submissions_to_dataframe(submissions: List[Dict[str, Any]]) -> pd.DataFrame:
|
|
| 146 |
'downs', 'ups']
|
| 147 |
df = pd.DataFrame(submissions)
|
| 148 |
df = df.convert_dtypes()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 149 |
df = df[cols]
|
|
|
|
| 150 |
# Convert the "created_utc" column to a datetime column with timezone information
|
| 151 |
df['created_utc'] = pd.to_datetime(df['created_utc'], unit='s').dt.tz_localize('UTC')
|
|
|
|
|
|
|
| 152 |
df['date'] = df['created_utc'].dt.date.astype(str)
|
| 153 |
df['time'] = df['created_utc'].dt.time.astype(str)
|
| 154 |
return df
|
|
|
|
| 146 |
'downs', 'ups']
|
| 147 |
df = pd.DataFrame(submissions)
|
| 148 |
df = df.convert_dtypes()
|
| 149 |
+
|
| 150 |
+
# As of Jan 2017 I'm getting an error:
|
| 151 |
+
# KeyError: "['downs', 'ups'] not in index"
|
| 152 |
+
# To maintain backwards compatibility I will initialize these cols
|
| 153 |
+
for col in cols:
|
| 154 |
+
if col not in df.columns:
|
| 155 |
+
df[col] = None
|
| 156 |
+
|
| 157 |
+
# Take the subset of columns
|
| 158 |
df = df[cols]
|
| 159 |
+
|
| 160 |
# Convert the "created_utc" column to a datetime column with timezone information
|
| 161 |
df['created_utc'] = pd.to_datetime(df['created_utc'], unit='s').dt.tz_localize('UTC')
|
| 162 |
+
|
| 163 |
+
# Native date and time types had some incompatibility with the datasets visualization widget, so store them as strings
|
| 164 |
df['date'] = df['created_utc'].dt.date.astype(str)
|
| 165 |
df['time'] = df['created_utc'].dt.time.astype(str)
|
| 166 |
return df
|