# Python 2 script: scrapes the articles in the current print edition of
# The Economist and writes them to a plain-text file.
import os
import urllib2
import re
import htmlentitydefs
import codecs
import time
from BeautifulSoup import BeautifulSoup

URL_REQUEST_DELAY = 1  # seconds to pause between article requests
BASE = 'http://www.economist.com'
TXDATA = None  # no POST data, so urllib2 issues GET requests
TXHEADERS = {'User-agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
OUTPUT_FILE = 'economist.txt'

def request_url(url, txdata, txheaders):
    """Gets a web page's HTML."""
    req = urllib2.Request(url, txdata, txheaders)
    handle = urllib2.urlopen(req)
    return handle.read()
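# Note (not in the original script): urllib2.urlopen raises
# urllib2.URLError (or its subclass urllib2.HTTPError) on failure, so a
# more defensive version would wrap each call, for example:
#   try:
#       html = request_url(url, TXDATA, TXHEADERS)
#   except urllib2.URLError:
#       continue  # skip articles that cannot be fetched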

def remove_html_tags(data):
    """Removes HTML tags with a simple non-greedy regex."""
    p = re.compile(r'<.*?>')
    return p.sub('', data)
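# Illustrative example (not in the original script):
#   remove_html_tags('<p>Hello, <b>world</b></p>')  ->  'Hello, world'
# A regex strip is adequate for the markup handled here, but it is not a
# general-purpose HTML parser.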

def unescape(text):
    """
    Converts HTML character references to Unicode code points.
    @param text The HTML (or XML) source text, in any encoding.
    @return The plain text as a Unicode string.
    """
    def fixup(m):
        text = m.group(0)
        if text[:2] == "&#":
            # Numeric character reference, e.g. &#233; or &#xe9;
            try:
                if text[:3] == "&#x":
                    return unichr(int(text[3:-1], 16))
                else:
                    return unichr(int(text[2:-1]))
            except ValueError:
                pass
        else:
            # Named entity, e.g. &eacute;
            try:
                text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
            except KeyError:
                pass
        return text
    return re.sub(r"&#?\w+;", fixup, text)
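# Illustrative example (not in the original script):
#   unescape(u'Fish &amp; chips: &#163;3')  ->  u'Fish & chips: \xa33'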

html = request_url('http://www.economist.com/printedition/', TXDATA, TXHEADERS)

# Use BeautifulSoup to navigate the HTML tree
soup = BeautifulSoup(html)

# Collect the article URLs linked from the <h2> headings inside the
# print edition's "block" divs
urls = []
for block in soup.findAll('div', {'class': 'block'}):
    for hTag in block.findAll('h2'):
        if hTag.find('a'):
            urls.append(hTag.find('a')['href'])

# Prefix relative links with the site root. Iterate over a copy of the
# list, because a list shouldn't be modified while iterating over it.
for i, url in enumerate(urls[:]):
    if not url.startswith('http'):
        urls[i] = BASE + url
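# Note (not in the original script): urlparse.urljoin(BASE, url) from the
# standard library handles more relative-link forms (e.g. 'article.html'
# or '//host/path') than plain concatenation, which assumes absolute
# paths such as '/node/12345'.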

# Extract the headline, rubric, dateline and body text from each article
# and write them to the output file
if os.path.exists(OUTPUT_FILE):
    os.remove(OUTPUT_FILE)
output = codecs.open(OUTPUT_FILE, 'a', 'utf-8')
for url in urls:
    content = ''
    html = request_url(url, TXDATA, TXHEADERS)
    html = unicode(html, 'utf-8', errors='ignore')
    soup = BeautifulSoup(html)
    body = soup.find('div', {'id': 'ec-article-body'})
    if body:
        # Get the headlines
        if body.find('h1'):
            h_one = body.find('h1').renderContents().strip()
            if h_one != '':
                output.write(unicode(h_one + "\n", 'utf-8'))
        if body.find('div', {'class': 'headline'}):
            headline = unescape(unicode(body.find('div', {'class': 'headline'})
                                        .renderContents(), 'utf-8')).strip()
            if headline != '':
                print headline
                output.write(headline + u'\n')
        if body.find('h2', {'class': 'rubric'}):
            h_two = body.find('h2', {'class': 'rubric'})\
                        .renderContents().strip()
            if h_two != '':
                output.write(unicode(h_two + "\n", 'utf-8'))
        # Get the date and location
        if body.find('p', {'class': 'ec-article-info'}):
            article_info = body.find('p', {'class': 'ec-article-info'})\
                               .renderContents().strip()
            article_info = remove_html_tags(article_info)
            article_info = re.sub(" +", " ", article_info)
            output.write(unicode(article_info + "\n\n", 'utf-8'))
        # Get the body text: paragraphs with no class attribute
        for p in body.findAll('p', {'class': None}):
            content = content + p.renderContents().strip() + "\n\n"
        content = remove_html_tags(content)
        content = re.sub(" +", " ", content)
        content = unicode(content, 'utf-8')
        content = unescape(content)
        output.write(content + "\n")
    # Pause between requests to be polite to the server
    time.sleep(URL_REQUEST_DELAY)
output.close()
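# The result is a plain-text file (economist.txt) with one article per
# block: headline, rubric and dateline, followed by the body paragraphs
# separated by blank lines.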