forked from Tss20/language-complexity
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscraping.py
More file actions
35 lines (25 loc) · 925 Bytes
/
scraping.py
File metadata and controls
35 lines (25 loc) · 925 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import requests
from bs4 import BeautifulSoup
import regex as re
# Literal HTML markers that bound the article prose inside the fetched page.
_CONTENT_START = '<div id="mw-content-text"'
_REFERENCES_HEADING = '<span class="mw-headline" id="References">'
_FIRST_PARAGRAPH = "<p>"

# Raw strings so the backslashes reach the regex engine untouched instead of
# being parsed as (invalid) Python string escapes.
_TAG_RE = re.compile(r"<[^>]*>")
# NOTE(review): the unescaped "|" is an alternation, so with ``match`` this
# hits any line that starts with "[" OR contains a "]" anywhere. It may have
# been meant to target "[... | ...]" edit links only -- confirm before
# tightening; the behavior is kept as-is here.
_BRACKET_LINE_RE = re.compile(r"\[.*?|.*?\]")
# "&#...;" character references, plus an optional ":" and up to three digits
# that directly follow (presumably footnote numbers such as "&#91;1&#93;").
_ENTITY_RE = re.compile(r"&\#(.*?);(\:{0,1})(0|1|2|3|4|5|6|7|8|9){0,3}")


def _find_marker(text, marker, url):
    """Return the offset of *marker* in *text* or raise a descriptive ValueError."""
    position = text.find(marker)
    if position == -1:
        raise ValueError(
            "marker {!r} not found in page {!r}".format(marker, url)
        )
    return position


def scrape(url, *, timeout=30.0, session=None):
    """Download a wiki article and return its body prose as plain text.

    The page is cut down to the span that starts at the first ``<p>`` inside
    the ``mw-content-text`` container and ends just before the "References"
    headline. All HTML tags are then removed, every line that starts with
    ``[`` or contains ``]`` is dropped, and ``&#...;`` character references
    (with an optional trailing ``:`` and up to three digits) are deleted.

    Args:
        url: Address of the article to download.
        timeout: Seconds handed to the HTTP ``get`` call so a stalled
            connection cannot block forever.
        session: Optional object exposing ``get(url, timeout=...)`` and
            returning a response with a ``text`` attribute (for example a
            ``requests.Session``, to reuse connections across many pages).
            When omitted, the module-level ``requests.get`` is used.

    Returns:
        The extracted text; every retained line ends with ``"\\n"``.

    Raises:
        ValueError: If the content container, the "References" headline, or
            an opening ``<p>`` tag cannot be found in the page.
    """
    http = requests if session is None else session
    html = http.get(url, timeout=timeout).text

    # Narrow step by step; each later marker is searched only in what is left.
    body = html[_find_marker(html, _CONTENT_START, url):]
    body = body[:_find_marker(body, _REFERENCES_HEADING, url)]
    body = body[_find_marker(body, _FIRST_PARAGRAPH, url):]

    body = _TAG_RE.sub("", body)
    kept_lines = [
        line for line in body.split("\n") if not _BRACKET_LINE_RE.match(line)
    ]
    results = "".join(line + "\n" for line in kept_lines)
    return _ENTITY_RE.sub("", results)
if __name__ == "__main__":
    # Manual smoke run: fetch one fixed article and write the extracted text
    # to standard output.
    article_text = scrape("https://simple.wikipedia.org/wiki/Dan_Kelly")
    print(article_text)