 # pip install -r requirements.txt
 import mechanicalsoup

-USER_AGENT = ('Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 '
-              '(KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36')
+USER_AGENT = (
+    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 "
+    "(KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"
+)

 regex1 = re.compile(
     r'<a href="https://www.congress.gov/member/[^/]+/(\w+)[^<]+</a></span>'
-    '[^<]*<div[^<]+<div class="member-image"><img src="/img/member/([^"]+)"')
+    '[^<]*<div[^<]+<div class="member-image"><img src="/img/member/([^"]+)"'
+)

 regex2 = re.compile('<a class="next" href="([^"]+)">')
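For context, `regex1` pulls two capture groups out of each search result: the Member's Bioguide ID from the profile link, and the image filename from the adjacent `member-image` div. A quick standalone check — the HTML fragment below is fabricated to match the shape the pattern expects, not captured Congress.gov output:

```python
import re

regex1 = re.compile(
    r'<a href="https://www.congress.gov/member/[^/]+/(\w+)[^<]+</a></span>'
    '[^<]*<div[^<]+<div class="member-image"><img src="/img/member/([^"]+)"'
)

# Fabricated search-result fragment matching the expected structure.
sample = (
    '<a href="https://www.congress.gov/member/jane-doe/D000000">'
    'Doe, Jane</a></span><div class="member-info">'
    '<div class="member-image"><img src="/img/member/d000000_200.jpg"'
)

match = regex1.search(sample)
print(match.group(1), match.group(2))  # D000000 d000000_200.jpg
```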
@@ -49,18 +52,20 @@ def get_photo_list(br, congress_number, delay):
         # Fetch a page of results from Congress.gov.
         print("Page %d of Congress.gov Member listing..." % page)
         response = br.get(
-            "https://www.congress.gov/search?" +
-            urlencode({
-                "q": json.dumps(
-                    {"source": "members",
-                     "congress": str(congress_number)}),
-                "pageSize": 250,
-                "page": page,
-            })).text
+            "https://www.congress.gov/search?"
+            + urlencode(
+                {
+                    "q": json.dumps(
+                        {"source": "members", "congress": str(congress_number)}
+                    ),
+                    "pageSize": 250,
+                    "page": page,
+                }
+            )
+        ).text

         if len(response) == 0:
-            sys.exit("Page is blank. Try again later, you may have hit a "
-                     "limit.")
+            sys.exit("Page is blank. Try again later, you may have hit a limit.")

         # Scan for links to Member pages and img tags. The link to the
         # Congress.gov page uses the Member's Bioguide ID as the key, and the
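The search URL here is just a JSON query serialized into a single `q` parameter. A minimal reproduction of the string being built — assuming the script's `urlencode` comes from Python 3's `urllib.parse`, and using the 114th Congress as the example value:

```python
import json
from urllib.parse import urlencode

# Same query parameters as the br.get() call above, for page 1.
params = {
    "q": json.dumps({"source": "members", "congress": "114"}),
    "pageSize": 250,
    "page": 1,
}
print("https://www.congress.gov/search?" + urlencode(params))
# https://www.congress.gov/search?q=%7B%22source%22%3A+%22members%22%2C+%22congress%22%3A+%22114%22%7D&pageSize=250&page=1
```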
@@ -116,8 +121,7 @@ def download_photos(br, photo_list, outdir, delay):
     ok = 0

     for bioguide_id, photo_filename in photo_list:
-        photo_url = ("https://memberguide.gpo.gov/PictorialImages/" +
-                     photo_filename)
+        photo_url = "https://memberguide.gpo.gov/PictorialImages/" + photo_filename
         print(bioguide_id, photo_url)

         filename = os.path.join(outdir, bioguide_id + ".jpg")
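Each `photo_list` entry pairs a Bioguide ID with a GPO image filename, and the loop maps that pair to a source URL and a local path. A sketch with a fabricated entry — real filenames on memberguide.gpo.gov may be shaped differently:

```python
import os

# Fabricated (bioguide_id, photo_filename) pair for illustration only.
bioguide_id, photo_filename = "D000000", "114_rh_doe_jane.jpg"

photo_url = "https://memberguide.gpo.gov/PictorialImages/" + photo_filename
filename = os.path.join("congress/original", bioguide_id + ".jpg")
print(photo_url)  # https://memberguide.gpo.gov/PictorialImages/114_rh_doe_jane.jpg
print(filename)   # congress/original/D000000.jpg
```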
@@ -145,20 +149,35 @@ def resize_photos():
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
         description="Scrape https://memberguide.gpo.gov and save "
-                    "members' photos named after their Bioguide IDs",
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+        "members' photos named after their Bioguide IDs",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
     parser.add_argument(
-        '-n', '--congress', default='114',
-        help="Congress session number, for example: 110, 111, 112, 113")
+        "-n",
+        "--congress",
+        default="114",
+        help="Congress session number, for example: 110, 111, 112, 113",
+    )
     parser.add_argument(
-        '-o', '--outdir', default="congress/original",
-        help="Directory to save photos in")
+        "-o",
+        "--outdir",
+        default="congress/original",
+        help="Directory to save photos in",
+    )
     parser.add_argument(
-        '-d', '--delay', type=int, default=5, metavar='seconds',
-        help="Rate-limiting delay between scrape requests")
+        "-d",
+        "--delay",
+        type=int,
+        default=5,
+        metavar="seconds",
+        help="Rate-limiting delay between scrape requests",
+    )
     parser.add_argument(
-        '-t', '--test', action='store_true',
-        help="Test mode: don't actually save images")
+        "-t",
+        "--test",
+        action="store_true",
+        help="Test mode: don't actually save images",
+    )
     args = parser.parse_args()

     br = mechanicalsoup.Browser()
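To confirm the reformatted parser behaves identically, it can be exercised standalone — help strings omitted for brevity, and the argument lists passed to `parse_args` are illustrative examples:

```python
import argparse

# Same flags and defaults as the parser defined above.
parser = argparse.ArgumentParser(
    formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
parser.add_argument("-n", "--congress", default="114")
parser.add_argument("-o", "--outdir", default="congress/original")
parser.add_argument("-d", "--delay", type=int, default=5, metavar="seconds")
parser.add_argument("-t", "--test", action="store_true")

args = parser.parse_args([])  # no flags: defaults apply
print(args.congress, args.outdir, args.delay, args.test)  # 114 congress/original 5 False

args = parser.parse_args(["-n", "113", "-t"])
print(args.congress, args.test)  # 113 True
```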