-
Notifications
You must be signed in to change notification settings - Fork 521
/
Copy pathhouse_websites.py
executable file
·111 lines (86 loc) · 3.34 KB
/
house_websites.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
#!/usr/bin/env python
# Uses https://www.house.gov/representatives/ to scrape official member websites.
# Only known source.
# Assumptions:
# member's state and district fields are present and accurate.
# member's most recent term in the terms field is their current one.
import lxml.html, io, urllib.request, urllib.error, urllib.parse
import re
import utils
from utils import load_data, save_data, states as state_names
def _parse_district(label):
    """Return the numeric district for a directory "District" cell label.

    "At Large", "Delegate" and "Resident Commissioner" are all recorded as
    district 0. Any other label has every non-digit character stripped before
    conversion (e.g. "5th" -> 5).

    Raises:
        ValueError: if the label is not one of the special names and contains
            no digits.
    """
    if label in ("At Large", "Delegate", "Resident Commissioner"):
        return 0
    return int(re.sub(r'[^\d]', '', label))


def _resolve_site_root(url):
    """Follow redirects from *url* and return the final URL cut off at ".gov".

    The House uses subdomains now, and occasionally the directory uses URLs
    with some trailing redirected-to page, like /home. We can safely use the
    subdomain as the root, to be future-proof against redirects changing
    mid-session.

    We should still follow any redirects, and not just trust the directory to
    have the current active subdomain. As an example, the directory lists
    randyforbes.house.gov, which redirects to forbes.house.gov.

    Raises:
        OSError: if the request fails (urllib.error.URLError and HTTPError
            are subclasses of OSError).
    """
    # Close the response as soon as we have the post-redirect URL so we don't
    # leak one open connection per member.
    with urllib.request.urlopen(url) as resp:
        final_url = resp.geturl()

    # Kill everything after the domain. The dot is escaped so that only a
    # literal ".gov/" matches.
    return re.sub(r"\.gov/.*$", ".gov", final_url)


def run():
    """Update the website URL of every current House member.

    Loads "legislators-current.yaml", downloads the House directory from
    https://www.house.gov/representatives/, and for each row of each state's
    table resolves the member's site (following redirects) and stores it as
    the 'url' of that member's last term. The updated data is written back to
    "legislators-current.yaml".

    Members are matched to directory rows by state + zero-padded district
    number taken from their last term; members whose last term is of type
    'sen' are ignored.

    If the directory can't be downloaded or parsed, a message is printed and
    the process exits. A member site that can't be fetched is reported and
    skipped, leaving that member's existing data untouched.
    """
    # default to not caching
    cache = utils.flags().get('cache', False)
    force = not cache

    current = load_data("legislators-current.yaml")

    # Index non-senators by "<state><2-digit district>" and remember which
    # states we need to look up in the directory.
    states = set()
    by_district = {}
    for m in current:
        last_term = m['terms'][-1]
        if last_term['type'] != 'sen':
            state = last_term['state']
            full_district = "%s%02d" % (state, int(last_term['district']))
            by_district[full_district] = m
            states.add(state)

    destination = "legislators/house.html"
    url = "https://www.house.gov/representatives/"
    body = utils.download(url, destination, force)
    if not body:
        print("Couldn't download House listing!")
        # NOTE(review): exits with status 0 even though this is a failure;
        # preserved as-is in case callers rely on it.
        exit(0)

    try:
        dom = lxml.html.parse(io.StringIO(body)).getroot()
    except lxml.etree.XMLSyntaxError:
        print("Error parsing House listing!")
        exit(0)

    # process:
    #   go through every state in our records, fetching that state's table
    #   go through every row after the first, pick the district to isolate the member
    #   pluck out the URL, update that member's last term's URL
    count = 0
    for state in sorted(states):
        # Each state's table is located through its caption, whose id is
        # "state-" plus the lower-cased, hyphenated state name.
        state_name = state_names[state].lower().replace(' ', '-')
        table = dom.cssselect("table.table caption#state-%s" % state_name)[0].getparent()

        # Map the "AQ" code to "AS" without rebinding the loop variable.
        row_state = "AS" if state == "AQ" else state

        for row in table.cssselect("tbody tr"):
            cells = row.cssselect("td")
            if not cells:
                # rows without <td> cells carry no member data
                continue

            district = _parse_district(str(cells[0].text_content()).strip())
            full_district = "%s%02d" % (row_state, district)

            original_url = cells[1].cssselect("a")[0].get("href")
            try:
                url = _resolve_site_root(original_url)
            except OSError as err:
                # One unreachable member site shouldn't abort the whole run
                # and throw away every URL resolved so far.
                print("[%s] Couldn't fetch %s (%s), skipping" % (full_district, original_url, err))
                continue

            if full_district in by_district:
                print("[%s] %s %s" % (full_district, url, "" if url == original_url.rstrip("/") else (" <= " + original_url)))
                by_district[full_district]['terms'][-1]['url'] = url
            else:
                print("[%s] No current legislator" % full_district)

            count += 1

    print("Processed %i people rows on House listing." % count)

    print("Saving data...")
    save_data(current, "legislators-current.yaml")
# Script entry point: only scrape when executed directly, not when imported.
if "__main__" == __name__:
    run()