-
Notifications
You must be signed in to change notification settings - Fork 521
/
Copy pathpictorial_ids.py
207 lines (175 loc) · 7.2 KB
/
pictorial_ids.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
#!/usr/bin/env python
import csv
import json
import unicodedata
import utils
from utils import load_data, mkdir_p, save_data, parse_date
# Update legislators current pictorial ids
# https://pictorialapi.gpo.gov/index.html
#
# options:
# --cache: load from cache if present on disk (default: false)
# --bioguide: load only one legislator, by their bioguide ID
# --congress: do *only* updates for legislators serving in specific congress
#
# example:
# python pictorial_ids.py --congress=118
def run():
# default to not caching
cache = utils.flags().get("cache", False)
force = not cache
only_bioguide = utils.flags().get("bioguide", None)
congress = utils.flags().get("congress", None)
data_files = []
print("Loading %s..." % "legislators-current.yaml")
legislators = load_data("legislators-current.yaml")
data_files.append((legislators, "legislators-current.yaml"))
print("Loading %s..." % "legislators-historical.yaml")
legislators = load_data("legislators-historical.yaml")
data_files.append((legislators, "legislators-historical.yaml"))
if congress == None:
raise Exception("the --congress flag is required")
elif int(congress) >= 110:
# Pictorial seems to go back to 110th Congress
url = f"https://pictorialapi.gpo.gov/api/GuideMember/GetMembers/{congress}"
pass
else:
raise Exception("no data for congress " + congress)
pictorial_destination = f"pictorial/source/GetMembers/{congress}.json"
pictorial_data = json.loads(utils.download(url, pictorial_destination, force))
# Filter out non-legislators and the vacant placeholders
pictorial_members = [
member
for member in pictorial_data["memberCollection"]
if member["memberType"] in ("Senator", "Representative", "Delegate")
and member["name"] != "Vacant, Vacant"
]
error_filename = f"cache/errors/pictorial/mismatch_{congress}.csv"
mkdir_p("cache/errors/pictorial")
error_log = csv.writer(open(error_filename, "w"))
error_log.writerow(
[
"message",
"bioguide_id",
"name_first",
"name_last",
]
)
error_count = 0
print("Running for congress " + congress)
for legislators, filename in data_files:
for legislator in legislators:
# this can't run unless we've already collected a bioguide for this person
bioguide = legislator["id"].get("bioguide", None)
# if we've limited this to just one bioguide, skip over everyone else
if only_bioguide and (bioguide != only_bioguide):
continue
# only run for selected congress
latest_term = legislator["terms"][-1]
latest_congress = utils.congress_from_legislative_year(
utils.legislative_year(parse_date(latest_term["start"]))
)
if int(congress) != latest_congress:
continue
# skip if we already have it
if legislator["id"].get("pictorial"):
continue
try:
pictorial_id = match_pictorial_id(legislator, pictorial_members)
legislator["id"]["pictorial"] = pictorial_id
except ValueError as e:
error_count += 1
error_log.writerow(
[
e,
bioguide,
legislator["name"]["first"],
legislator["name"]["last"],
]
)
save_data(legislators, filename)
if error_count:
print(f"{error_count} error details written to {error_filename}")
def to_ascii(s):
return unicodedata.normalize("NFKD", s).encode("ASCII", "ignore").decode("ASCII")
def reverse_name(name):
"""
Given a name in "Last, First" format, return "First Last"
"""
return " ".join(name.split(", ")[::-1])
def match_pictorial_id(legislator, pictorial_members):
"""
Attempt to find the corresponding pictorial id for the given member.
There are many odd cases -- see tests/test_gpo_member_photos.py for
examples.
"""
name = legislator["name"]["official_full"]
# Map common nicknames (and GPO typos) from legislators to pictorial
common_nicknames = {
"Nick": "Nicolas",
"Daniel": "Dan",
"Mike": "Michael",
"Michael": "Mike",
"Richard": "Rich",
"Christopher": "Chris",
"JOhn": "John",
}
matches = []
for member_pictorial in pictorial_members:
# First check whether the name matches
name_matches = False
legislator_name_last = to_ascii(legislator["name"]["last"].replace(" ", ""))
legislator_name_first = to_ascii(legislator["name"]["first"].replace(" ", ""))
if legislator_name_last == member_pictorial["lastName"]:
if legislator_name_first == member_pictorial["firstName"] or (
"nickname" in legislator["name"]
and legislator["name"]["nickname"] == member_pictorial["firstName"]
):
name_matches = True
# Sometimes the nickname is encoded in the first name
elif member_pictorial["firstName"] in legislator_name_first:
name_matches = True
# Sometimes the nickname is encoded in the middle name
elif (
"middle" in legislator["name"]
and member_pictorial["firstName"] in legislator["name"]["middle"]
):
name_matches = True
# Sometimes the nickname is not encoded
elif (
member_pictorial["firstName"] in common_nicknames
and common_nicknames[member_pictorial["firstName"]]
== legislator_name_first
):
name_matches = True
# Sometimes matching the official full name is best
if legislator["name"]["official_full"] == reverse_name(
member_pictorial["name"]
):
name_matches = True
# The GPO has some first and last names swapped, so check those too
if not name_matches and legislator_name_first == member_pictorial["lastName"]:
if legislator_name_last == member_pictorial["firstName"] or (
"nickname" in legislator["name"]
and legislator["name"]["nickname"] == member_pictorial["firstName"]
):
name_matches = True
# If the name matches, check the office and state
# Note: Assumes we're matching against most recent term
if name_matches:
most_recent_term = legislator["terms"][-1]
mType = "sen" if member_pictorial["memberType"] == "Senator" else "rep"
if (
most_recent_term["state"] == member_pictorial["stateId"]
and most_recent_term["type"] == mType
):
matches.append(member_pictorial)
if len(matches) == 1:
return matches[0]["memberId"]
else:
if len(matches):
raise ValueError(f"Multiple pictorial id matches found for {name}")
else:
raise ValueError(f"No pictorial id match found for {name}")
if __name__ == "__main__":
run()