-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathget_content_for_month.py
More file actions
110 lines (89 loc) · 3.15 KB
/
get_content_for_month.py
File metadata and controls
110 lines (89 loc) · 3.15 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import os
import sys
import pandas as pd
import requests
import re
import time
from tqdm import tqdm
import mwparserfromhell as mwp
CONTENT_URL = "https://en.wikipedia.org/w/api.php?action=parse&oldid={0}&prop=wikitext&formatversion=2&format=json"
def _parse_and_clean_wikicode(raw_content):
    """Parse raw wikitext and return plain text with references, tables,
    file/image/media links, and note-list templates stripped out.

    Sections are cleaned individually and re-joined with blank lines.
    """
    wikicode = mwp.parse(raw_content)

    # Matches File:/Image:/Media: wikilink targets (case-insensitive).
    file_link_pattern = re.compile(
        "^(?:File|Image|Media):", flags=re.IGNORECASE | re.UNICODE
    )

    def is_file_link(node):
        return bool(file_link_pattern.match(str(node.title)))  # pytype: disable=wrong-arg-types

    def is_ref_or_table(node):
        return str(node.tag) in {"ref", "table"}

    note_template_names = {
        "reflist",
        "notelist",
        "notelist-ua",
        "notelist-lr",
        "notelist-ur",
        "notelist-lg",
    }

    def is_note_template(node):
        return node.name.lower() in note_template_names

    def remove_quietly(node, container):
        try:
            container.remove(node)
        except ValueError:
            # For unknown reasons, objects are sometimes not found.
            pass

    cleaned_sections = []
    # Clean each section individually, in document order.
    for section in wikicode.get_sections(
        flat=True, include_lead=True, include_headings=True
    ):
        for node in section.ifilter_wikilinks(matches=is_file_link, recursive=True):
            remove_quietly(node, section)
        for node in section.ifilter_templates(matches=is_note_template, recursive=True):
            remove_quietly(node, section)
        for node in section.ifilter_tags(matches=is_ref_or_table, recursive=True):
            remove_quietly(node, section)
        cleaned_sections.append(section.strip_code().strip())
    return "\n\n".join(cleaned_sections)
def get_valid_filename(s):
    """Return *s* made safe for use as a filename.

    Spaces become underscores; every character other than word characters,
    dashes, and dots is dropped.
    """
    underscored = str(s).strip().replace(' ', '_')
    return re.sub(r'(?u)[^-\w.]', '', underscored)
def get_content_for_revision_id(revision_id, last_try: bool = False):
    """Fetch the raw wikitext for a Wikipedia revision id.

    Retries up to 3 times when the API response lacks a "parse" key, and
    restarts once from scratch if an exception (network error, bad JSON)
    is raised.

    Args:
        revision_id: Wikipedia revision (oldid) to fetch.
        last_try: Internal flag — True on the single recursive retry so a
            second failure gives up instead of recursing forever.

    Returns:
        The wikitext string, or None when the revision cannot be retrieved.
    """
    try:
        retries = 3
        while retries > 0:
            # timeout= keeps a stalled connection from hanging the whole run
            # (requests.get blocks indefinitely without it).
            content = requests.get(
                url=CONTENT_URL.format(revision_id), timeout=30
            ).json()
            if "parse" in content:
                return content["parse"]["wikitext"]
            print(f"Failed to find for {revision_id}, retrying")
            retries -= 1
            time.sleep(1)
        # All retries produced a response without a "parse" key; the original
        # fell off the loop and returned None silently — make that explicit.
        print(f"Failed to find for {revision_id}, giving up")
        return None
    except Exception:
        # Broad catch is deliberate best-effort: any network/JSON failure
        # earns exactly one full restart before giving up.
        time.sleep(1)
        if not last_try:
            print("Retrying")
            return get_content_for_revision_id(revision_id, last_try=True)
        print(f"Failed to find for {revision_id}, giving up")
        return None
def get_content_for_article(row, save_path):
    """Download, clean, and save the article text for each month column of *row*.

    Args:
        row: A pandas Series with a 'title' entry plus per-month revision-id
            columns; the first two columns are treated as metadata and
            skipped, and -1 marks a month with no revision.
        save_path: Root directory; files land in
            {save_path}/{YYYY-MM}/{sanitized-title}-{revision_id}.txt.
    """
    for month_no, month in enumerate(row.index):
        # The first two columns are not month columns; skip them.
        if month_no <= 1:
            continue
        if row[month] == -1:
            continue  # -1 marks "no revision for this month"
        content = get_content_for_revision_id(row[month])
        if content is None:
            # Fetch failed after all retries; skip this month instead of
            # crashing inside _parse_and_clean_wikicode(None).
            print(f"Skipping {row['title']} for {month}: no content retrieved")
            continue
        # Flatten section heading markers left over after strip_code.
        clean_content = _parse_and_clean_wikicode(content).replace("===", " ").replace("==", " ")
        save_folder = "-".join(month.split("-")[:2])
        # exist_ok avoids the exists()/makedirs() race between parallel runs.
        os.makedirs(f"{save_path}/{save_folder}", exist_ok=True)
        out_path = f"{save_path}/{save_folder}/{get_valid_filename(row['title'])}-{int(row[month])}.txt"
        with open(out_path, "w", encoding="utf-8") as f:
            f.write(row["title"] + "\n\n" + clean_content)
if __name__ == "__main__":
    # `assert` is stripped under `python -O`; validate argv explicitly and
    # exit with a usage message instead.
    if len(sys.argv) <= 2:
        sys.exit("Provide a file with revision ids and a save path location")
    content_file_path = sys.argv[1]
    save_path = sys.argv[2]
    tqdm.pandas()  # enables DataFrame.progress_apply with a progress bar
    df = pd.read_csv(content_file_path)
    # One row per article: fetch and save the content for every month column.
    df.progress_apply(lambda x: get_content_for_article(x, save_path), axis=1)