1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
| #!/usr/bin/env perl
use strict;
use LWP::UserAgent;
use HTTP::Cookies;
use HTML::TreeBuilder 3;
my $OUTPUT_FILE = 'nyt_top_stories.txt';
# User agent needs to accept cookies to access NYT
my $cookie = 'nyt_cookie.lwp';
my $cookie_jar = HTTP::Cookies->new('file' => $cookie, 'autosave' => 1);
my $content = get_html('http://global.nytimes.com/');
my $tree = HTML::TreeBuilder->new_from_content($content);
# Stores homepage URLS
my @urls;
scan_nyt_tree($tree, 'http://global.nytimes.com/');
$tree->delete();
unlink $OUTPUT_FILE;
# Scrape article from each URL
foreach (@urls) {
$content = get_html($_);
# Replace all newline characters, needed for $rawtext extraction
$content =~ s/\n//g;
# Extracts headline, byline, dateline, and raw text
my $headline;
if ($content =~ m/<nyt_headline .*?>(.*?)< \/NYT_HEADLINE>/) {
$headline = $1;
}
my $byline;
if ($content =~ m/<nyt_byline .*?>.*?<a \shref.*?>(.*?)< \/a>/) {
$byline = $1;
}
my $dateline;
if ($content =~ m/class="dateline">.*?Published:\s+([\w\s,]+)< \//) {
$dateline = $1;
}
my $rawtext;
if ($content =~ m/<NYT_TEXT.*?>(.*)< \/NYT_TEXT>/) {
$rawtext = $1;
}
# Parses article's text by extracting everything between <p> tags
my $text;
while ($rawtext =~ m/</p><p>(.*?)< \/p>/g) {
$text .= "\n\n$1";
}
$text =~ s/ +/ /g; # REPLACE MUTLIPLE SPACES WITH ONE
$text =~ s/< .*?>//g; # REMOVE HTML TAGS
$text =~ s/—/--/g; # REPLACE HTML EM-DASH CODE WITH 2 HYPHENS
$text =~ s/'|‘/'/g; # REPLACE SMART APOSTROPHES WITH '
$text =~ s/"|”/"/g; # REPLACE SMART QUOTATIONS WITH "
$text =~ s/ / /g;
open(OUTPUT, ">>$OUTPUT_FILE") or die("Cannot open $OUTPUT_FILE\n");
print OUTPUT "$headline\n$byline\n$dateline$text\n\n\n";
close(OUTPUT);
}
# Stores a web page's HTML as string
sub get_html {
my $url = $_[0];
my $browser = LWP::UserAgent->new();
$browser->cookie_jar($cookie_jar);
# $response declared out here to be accessible after while loop
my $response;
# Prevents inifinite loops
my $redirect_limit = 5;
my $x = 0;
# Sends GET request, follows redirects until response code 200 received
# Stores successful request URL
my $responseCode = 0;
while ($responseCode != 200 && $x < $redirect_limit) {
$response = $browser->get($url);
$responseCode = $response->code;
print "$url\n";
#print "response code: $responseCode\n";
$url = $response->header('Location');
$x++;
}
return $response->content;
}
# Picks out URLs of top NYT articles
sub scan_nyt_tree {
my ($root, $docbase) = @_;
foreach my $div ($root->find_by_tag_name('div')) {
my $class = $div->attr('class') || next;
if ($class eq 'story') {
my @children = $div->content_list;
for (my $i = 0; $i < = $#children; $i++) {
if (ref $children[$i] and
($children[$i]->tag eq 'h2' ||
$children[$i]->tag eq 'h3' ||
$children[$i]->tag eq 'h5')) {
my @grandchildren = $children[$i]->content_list;
# Search sibling if 1st grandchild not <a>
if (ref $grandchildren[0] and $grandchildren[0]->tag eq 'a') {
push (@urls, URI->new_abs($grandchildren[0]->attr('href') || next, $docbase));
}
}
}
}
}
return;
}
|