forked from kennyyu/bootcamp-scraper
-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathexample_soln.py
More file actions
83 lines (64 loc) · 2.59 KB
/
example_soln.py
File metadata and controls
83 lines (64 loc) · 2.59 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import bs4
import urllib
AMAZON_URL = "http://www.amazon.com/s/?keywords=ocaml"
def float_of_money(s):
    """
    Parse a money string into a float.

    Commas (thousands separators) are dropped, and only the text that
    follows the last "$" is converted. For a range written as
        $30.00 - $65.00
    that means the right-hand amount is returned (65.00).
    """
    cleaned = str(s).replace(",", "")
    # rpartition hands back the whole string as the tail when there is no "$"
    _, _, amount = cleaned.rpartition("$")
    return float(amount)
def is_result_div(tag):
    """
    Return True if the tag is a div, has an id attribute, and that id
    starts with "result_"; return False otherwise.

    Intended to be passed as a filter function to soup.find_all().
    """
    if tag.name != "div":
        return False
    # has_attr() is the supported spelling; has_key() is deprecated in bs4.
    return tag.has_attr("id") and str(tag["id"]).startswith("result_")
if __name__ == "__main__":
    # Script entry point: fetch the search page at AMAZON_URL, then print the
    # page title, the number of result divs, the first result div, and one
    # (title, author, link, new_price, used_price) tuple per result.

    # Local import: only this script section needs it.
    from contextlib import closing

    # get html webpage; closing() releases the handle even if read() raises
    with closing(urllib.urlopen(AMAZON_URL)) as urlobject:
        html = urlobject.read()

    # create a beautiful soup object to read data from the webpage; the
    # parser is named explicitly so the result does not depend on which
    # optional parsers are installed
    soup = bs4.BeautifulSoup(html, "html.parser")

    # print title of web page (the tag itself, then just its text)
    print(soup.title)
    if soup.title is not None:
        print(soup.title.text)

    # find all divs with ids starting with "result_"
    result_divs = soup.find_all(is_result_div)
    print(len(result_divs))

    # print out first one to see structure
    if result_divs:
        print(result_divs[0])

    # loop over all results
    for result_div in result_divs:
        # holds general information about product
        title_div = result_div.find(class_="productTitle")
        if title_div is None or title_div.a is None:
            # no title link in this result, so there is nothing to report
            continue

        # link to the product page ("" when the anchor has no href)
        link = title_div.a.get("href", "")

        # product title; lstrip() removes leading white space
        title_string = title_div.a.string
        title = "" if title_string is None else str(title_string).lstrip()

        # author, if the title div carries a byline span
        author = ""
        if title_div.span is not None:
            # keep only the text of each child; children without a single
            # string are skipped instead of contributing the text "None"
            child_strings = (node.string for node in title_div.span.contents)
            byline = "".join(str(piece) for piece in child_strings
                             if piece is not None).lstrip()
            # remove the "by" in "by John Doe", but only when it is there
            author = byline[3:] if byline.startswith("by ") else byline

        # divs that hold new and used prices
        new_price_div = result_div.find(class_="newPrice")
        used_price_div = result_div.find(class_="usedNewPrice")

        # prices default to 0. when the expected markup is missing
        new_price = 0.
        if new_price_div is not None and new_price_div.span is not None:
            new_price = float_of_money(new_price_div.span.string)

        used_price = 0.
        if (used_price_div is not None
                and used_price_div.div is not None
                and used_price_div.div.span is not None):
            used_price = float_of_money(used_price_div.div.span.string)

        # all the information we want! (printed as one tuple)
        print((title, author, link, new_price, used_price))