#!/usr/bin/env ruby
require 'uri'
require 'net/http'
require 'json'
if ARGV.empty? || ARGV.length > 2 || ["-h", "--help", "-?", "/?"].include?(ARGV[0])
  puts "ERROR: Invalid parameters"
  puts ""
  puts "Usage: scrape.rb DOCKET_ID [API_KEY]"
  puts "Example: scrape.rb NIST-2021-0007 gjekwWKTJD1289dJAKDjf93"
  puts "         (not a valid API key)"
  puts ""
  puts "JSON output is written to comments_[timestamp].json"
  puts "Then, to generate an HTML table: make_html.rb comments_[timestamp].json"
  puts "Or all in one step: scrape.rb DOCKET_ID [API_KEY] | make_html.rb"
  puts ""
  puts "Without an API key, the script will hang after fewer than 30 comments have been retrieved."
  puts "Request a key at https://open.gsa.gov/api/regulationsgov/"
  puts ""
  exit 1
end
API_KEY = ARGV[1] || "DEMO_KEY"
# TODO: add rate limiting. 1000 requests per hour
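# One possible approach (sketch only, not wired in; names are placeholders): space the calls
# evenly so the hourly quota is never exceeded, e.g. by calling a throttle helper at the top
# of reggov_request:
#
#   THROTTLE_INTERVAL = 3600.0 / 1000   # seconds between requests at 1000 requests/hour
#   def throttle
#     @last_request ||= Time.at(0)
#     wait = THROTTLE_INTERVAL - (Time.now - @last_request)
#     sleep(wait) if wait > 0
#     @last_request = Time.now
#   end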
def reggov_request(endpoint, params={})
  uri = URI("https://api.regulations.gov/v4/" + endpoint)
  uri.path = uri.path.gsub('//', '/') # allow caller to either omit or include a leading slash in endpoint
  uri.query = URI.encode_www_form(params.merge({"api_key" => API_KEY}))
  res = Net::HTTP.get_response(uri)
  raise "Request to #{endpoint} failed: #{res.code} #{res.message}" unless res.is_a?(Net::HTTPSuccess)
  return JSON.parse(res.body)
end
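# List the documents posted under a docket. Returns an array of hashes with :id, :title and
# :object_id; the objectId is the value comments are filed against (see get_comments below).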
def get_documents(docket_id)
  response = reggov_request("/documents", "filter[docketId]" => docket_id)
  return response["data"].map {|d|
    {id: d["id"],
     title: d["attributes"]["title"],
     object_id: d["attributes"]["objectId"]}
  }
end
# NOTE: does not work if a document has more than 5000 comments. For documents with more than
# 5000 comments, a different technique is required. See https://open.gsa.gov/api/regulationsgov/#searching-for-comments-1
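# Collect the IDs of every comment posted on the document with the given objectId, 250 per page.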
def get_comments(object_id)
  comments = []
  page_number = 1
  while true
    # Do not remove the sort field: without a stable sort order, pagination can return duplicates
    response = reggov_request("/comments", "filter[commentOnId]" => object_id,
                              "page[size]" => 250,
                              "page[number]" => page_number,
                              "sort" => "lastModifiedDate")
    comments += response["data"].map { |c| c["id"] }
    if response["meta"]["hasNextPage"] then page_number += 1 else break end
  end
  return comments
end
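# Fetch one comment's full details: submitter name/organization/location, the comment text,
# and the download URLs of any attachments.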
def get_comment(comment_id)
  response = reggov_request("/comments/#{comment_id}", include: "attachments")
  c = response["data"]["attributes"]
  comment = {
    organization: c["organization"],
    firstName: c["firstName"],
    lastName: c["lastName"],
    city: c["city"],
    state: c["stateProvinceRegion"],
    country: c["country"],
    id: comment_id,
    comment: c["comment"]
  }
  comment[:attachments] = response["included"]&.map { |a|
    {title: a.dig("attributes", "title"),
     urls: a.dig("attributes", "fileFormats")&.map { |f| f["fileUrl"] }}
  }
  return comment
end
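# Print an in-place, zero-padded progress counter to stderr, e.g. "007 of 119".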
def progress(current, target)
  STDERR.print "\r"
  pad_to = target.to_s.length
  STDERR.print "#{current.to_s.rjust(pad_to, "0")} of #{target}"
  STDERR.flush
end
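# Main flow: list the docket's documents, then fetch the full details of every comment on each one.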
docket_id = ARGV[0]
documents = get_documents(docket_id)
documents.each do |d|
  d[:comments] = get_comments(d[:object_id])
  num_comments = d[:comments].length
  STDERR.puts "Getting #{num_comments} comments for document #{d[:id]} - #{d[:title]}"
  cur = 1
  d[:comments].map! do |c|
    progress(cur, num_comments)
    cur += 1
    get_comment(c)
  end
  STDERR.print "\n"
end
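# Write the collected data to a timestamped JSON file. When stdout is piped (e.g. into
# make_html.rb), also echo the filename so the next tool knows which file to read.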
output_json = JSON.pretty_generate(documents)
output_filename = File.expand_path("comments_#{Time.now.to_i}.json")
File.write(output_filename, output_json)
STDERR.puts "Comment data saved to: #{output_filename}\n"
puts output_filename unless STDOUT.tty?