legis_getpages.py
"""
Index the phone numbers and websites of legislators so we can look for
links in the Washington Post pages by URL or by phone. For each
legislator's most recent term this script fetches the member's website
(through the local page cache) and fills in any missing 'contact_form'
and 'rss_url' fields by scanning the page for likely links.
"""
import os
import re
import json
import pprint
import urllib
from cStringIO import StringIO
from urlparse import urlsplit, urlunsplit

import lxml.html

#import legislators_current as leg
import legislators_other as leg
import cache
import encode
import dump

_legs = None
_glob = None
delete_data = False
seen = {}
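# Assumed shape of the data returned by leg.load(), inferred from how
# process() walks it below (ids and values here are hypothetical):
#
#   _legs = {'wp': {'some-washpost-id': {'terms': [
#       {'url': 'http://example.gov/',
#        'contact_form': '...',    # filled in below when missing
#        'rss_url': '...'}]}}}     # filled in below when missing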
def scan_contact(data, base_url):
    """Return the first href on the page that looks like a contact or
    email link, made absolute against base_url, or None."""
    try:
        d = lxml.html.document_fromstring(data)
        d.make_links_absolute(base_url)
        for (f_name_element, attr, f_link, pos) in d.iterlinks():
            f_link = f_link.lower()
            if attr == 'href':
                # check the most specific pattern first
                if f_link.find("email-me") > 0:
                    print "found email-me link", (f_name_element, attr, f_link, pos)
                    return f_link
                if f_link.find("email") > 0:
                    print "found email link", (f_name_element, attr, f_link, pos)
                    return f_link
                if f_link.find("contact") > 0:
                    print "found contact link", (f_name_element, attr, f_link, pos)
                    return f_link
    except Exception, e:
        print e, "cannot read", base_url
    return None
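# A minimal sketch of scan_contact() on a hypothetical page; the HTML
# and the example.gov base URL are illustrative only.
def _demo_scan_contact():
    html = '<html><body><a href="/email-me">Email me</a></body></html>'
    # make_links_absolute() rewrites "/email-me" to
    # "http://example.gov/email-me", which matches the "email-me" rule,
    # so this returns "http://example.gov/email-me".
    return scan_contact(html, "http://example.gov/")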
def scan_rss(data, base_url):
    """Return the first href on the page that mentions "rss", made
    absolute against base_url, or None."""
    try:
        d = lxml.html.document_fromstring(data)
        d.make_links_absolute(base_url)
        for (f_name_element, attr, f_link, pos) in d.iterlinks():
            f_link = f_link.lower()
            if attr == 'href':
                if f_link.find("rss") > 0:
                    print "found rss", f_link
                    return f_link
    except Exception, e:
        print "rss error", e, "cannot read", base_url
    return None
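# The same sketch for scan_rss(); again a hypothetical page.
def _demo_scan_rss():
    html = '<html><body><a href="/feeds/rss.xml">Feed</a></body></html>'
    # returns "http://example.gov/feeds/rss.xml"
    return scan_rss(html, "http://example.gov/")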
def index(contacts, n, term, field_list):
    """Fill in each field of field_list that is missing from term by
    scanning the legislator's cached home page. Mutates and returns
    term."""
    main = cache.cacheweb(term['url'])
    for f in field_list:
        if f in term:
            url = term[f]  # already present, leave it alone
        else:
            newdata = ""
            print f, " missing in ", n, term
            if f == 'contact_form':
                newdata = scan_contact(main, term['url'])
            if f == 'rss_url':
                newdata = scan_rss(main, term['url'])
            if newdata is not None:
                # un-escape entities so the stored link is a plain URL
                newdata = newdata.replace("&amp;", "&")
                term[f] = newdata
    return term
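# Hedged sketch of index() on a hypothetical term record; it assumes
# cache.cacheweb(url) returns the page HTML for that url, as the code
# above relies on.
def _demo_index():
    term = {'url': 'http://example.gov/'}
    # 'contact_form' is missing, so index() fetches term['url'] through
    # the cache, scans it with scan_contact(), and stores any hit on
    # the term dict in place.
    return index({}, 'example-id', term, ['contact_form'])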
def process():
    """Walk every legislator keyed by Washington Post id and fill in
    the missing contact_form and rss_url fields on the latest term.
    The term dicts inside _legs are updated in place."""
    global _legs
    contacts = {}
    for x in sorted(_legs['wp'].keys()):
        t = _legs['wp'][x]['terms'][-1]
        t = index(contacts, x, t, ['contact_form'])
        # t = index(contacts, x, t, ['url'])
        t = index(contacts, x, t, ['rss_url'])
    return contacts

def loadlegs():
    return leg.load()

def doit():
    return process()

def save():
    dump.dump(_legs)

# script body: load the legislators, fill in the missing contact and
# rss fields, and dump the updated records
_legs = loadlegs()
doit()
save()