import csv
import json
import glob
import os

import utils


def generate_csv():
    # yaml filenames
    yamls = ["legislators-current.yaml", "legislators-historical.yaml"]
    yaml_social = "legislators-social-media.yaml"

    # lists of (yaml field name, csv column name) tuples, split into categories
    # that do not mirror the yaml structure (ordered for logical csv columns)
    bio_fields = [
        ("last", "last_name"),
        ("first", "first_name"),
        ("middle", "middle_name"),
        ("suffix", "suffix"),
        ("nickname", "nickname"),
        ("official_full", "full_name"),
        ("birthday", "birthday"),
        ("gender", "gender")
    ]

    # ID crosswalks; FEC ids may hold multiple values and are comma-joined below
    crosswalk_fields = [
        ("bioguide", "bioguide_id"),
        ("thomas", "thomas_id"),
        ("opensecrets", "opensecrets_id"),
        ("lis", "lis_id"),
        ("fec", "fec_ids"),
        ("cspan", "cspan_id"),
        ("govtrack", "govtrack_id"),
        ("votesmart", "votesmart_id"),
        ("ballotpedia", "ballotpedia_id"),
        ("washington_post", "washington_post_id"),
        ("icpsr", "icpsr_id"),
        ("wikipedia", "wikipedia_id")
    ]

    # separate list for children of "terms"; the csv only captures the most recent term.
    # start/end dates are currently excluded: earliest start to latest end is deceptive
    # (it hides gaps), as are the start/end dates of the most recent term alone
    term_fields = [
        ("type", "type"),
        ("state", "state"),
        ("district", "district"),
        ("class", "senate_class"),
        ("party", "party"),
        ("url", "url"),
        ("address", "address"),
        ("phone", "phone"),
        ("contact_form", "contact_form"),
        ("rss_url", "rss_url"),
    ]

    # pulled from legislators-social-media.yaml
    social_media_fields = [
        ("twitter", "twitter"),
        ("twitter_id", "twitter_id"),
        ("facebook", "facebook"),
        ("youtube", "youtube"),
        ("youtube_id", "youtube_id"),
        ("mastodon", "mastodon")
    ]
print("Loading %s..." %yaml_social)
social = utils.load_data(yaml_social)
for filename in yamls:
print("Converting %s to CSV..." % filename)
legislators = utils.load_data(filename)
#convert yaml to csv
csv_output = csv.writer(open("../" + filename.replace(".yaml", ".csv"),"w"))
head = []
for pair in bio_fields:
head.append(pair[1])
for pair in term_fields:
head.append(pair[1])
for pair in social_media_fields:
head.append(pair[1])
for pair in crosswalk_fields:
head.append(pair[1])
csv_output.writerow(head)
for legislator in legislators:
legislator_row = []
for pair in bio_fields:
if 'name' in legislator and pair[0] in legislator['name']:
legislator_row.append(legislator['name'][pair[0]])
elif 'bio' in legislator and pair[0] in legislator['bio']:
legislator_row.append(legislator['bio'][pair[0]])
else:
legislator_row.append(None)
for pair in term_fields:
latest_term = legislator['terms'][len(legislator['terms'])-1]
if pair[0] in latest_term:
legislator_row.append(latest_term[pair[0]])
else:
legislator_row.append(None)
social_match = None
for social_legislator in social:
if 'bioguide' in legislator['id'] and 'bioguide' in social_legislator['id'] and legislator['id']['bioguide'] == social_legislator['id']['bioguide']:
social_match = social_legislator
break
elif 'thomas' in legislator['id'] and 'thomas' in social_legislator['id'] and legislator['id']['thomas'] == social_legislator['id']['thomas']:
social_match = social_legislator
break
elif 'govtrack' in legislator['id'] and 'govtrack' in social_legislator['id'] and legislator['id']['govtrack'] == social_legislator['id']['govtrack']:
social_match = social_legislator
break
for pair in social_media_fields:
if social_match != None:
if pair[0] in social_match['social']:
legislator_row.append(social_match['social'][pair[0]])
else:
legislator_row.append(None)
else:
legislator_row.append(None)
for pair in crosswalk_fields:
if pair[0] in legislator['id']:
value = legislator['id'][pair[0]]
if isinstance(value, list):
# make FEC IDs comma-separated
value = ",".join(value)
legislator_row.append(value)
else:
legislator_row.append(None)
csv_output.writerow(legislator_row)
generate_district_office_csv()
def generate_district_office_csv():
filename = "legislators-district-offices.yaml"
print("Converting %s to CSV..." % filename)
legislators_offices = utils.load_data(filename)
fields = [
"bioguide", "thomas", "govtrack", "id", "address", "building",
"city", "fax", "hours", "phone", "state", "suite", "zip",
"latitude", "longitude"]
f = open("../" + filename.replace(".yaml", ".csv"), "w")
csv_output = csv.DictWriter(f, fieldnames=fields)
csv_output.writeheader()
for legislator_offices in legislators_offices:
legislator_ids = legislator_offices['id']
for office in legislator_offices['offices']:
office.update(legislator_ids)
csv_output.writerow(office)
def generate_json():
    # yaml filenames
    yamls = list(map(os.path.basename, glob.glob("../*.yaml")))

    for filename in yamls:
        print("Converting %s to JSON..." % filename)
        data = utils.load_data(filename)

        # handle an edge case for twitter ids in the social media data:
        # json/js can only represent integers exactly up to 53 bits, so 64-bit
        # twitter ids *must* be stringified to consistently preserve their value
        # in json; otherwise they may be rounded and malformed
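        # A rough illustration of that rounding, assuming only that JS numbers are
        # IEEE-754 doubles (the same representation as Python floats):
        #     int(float(1234567890123456789)) == 1234567890123456768
        # i.e. an id above 2**53 read back as a JS number can silently change value.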
        if 'legislators-social-media' in filename:
            for social_legislator in data:
                if 'twitter_id' in social_legislator['social']:
                    social_legislator['social']['twitter_id'] = str(social_legislator['social']['twitter_id'])

        # convert yaml to json
        utils.write(
            json.dumps(data, default=utils.format_datetime, indent=2),
            "../" + filename.replace(".yaml", ".json"))


if __name__ == '__main__':
    generate_csv()
    generate_json()
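
# Usage sketch (an assumption about the surrounding repo layout, not stated in the
# script itself): this file is expected to live in the scripts/ directory next to
# utils.py, with the *.yaml data files one directory up (hence the "../" paths),
# so running it regenerates the bulk .csv/.json files alongside the yaml:
#
#     cd scripts
#     python alternate_bulk_formats.py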