-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathimm_scraper.py
127 lines (107 loc) · 4.39 KB
/
imm_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
#!/usr/bin/env python
import pandas as pd
import requests
import lxml.html
from bs4 import BeautifulSoup
from selenium import webdriver
from time import sleep
from random import randint
import datetime
# NOTE(review): removed a dead triple-quoted string that held a half-written,
# syntactically broken ImmScraper class (duplicated get_page definition,
# truncated "def click_page"). It was never executed -- the procedural script
# below is the live implementation -- and keeping broken code in a string
# literal only misleads future readers.
# --- Page setup -------------------------------------------------------------
url = "http://trac.syr.edu/phptools/immigration/detain/"

# NOTE(review): this requests/BeautifulSoup fetch is never used afterwards --
# the page is JS-rendered, so all scraping below goes through Selenium. Kept
# for parity with the original script; safe to delete once confirmed unused.
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')  # , from_encoding="utf-8")

# Selenium drives a real Firefox so the JS-built tables exist in the DOM.
driver = webdriver.Firefox()
driver.get(url)
print(driver.title)  # call form works on both Python 2 and 3
html = BeautifulSoup(driver.page_source, 'lxml')
sleep(6)  # give the page time to finish rendering before we start clicking
# This identifies the current things that are clicked on and returns them as a dict
# This identifies the current things that are clicked on and returns them as a dict
def scrape_details():
    """Scrape the currently-selected state/jurisdiction and its counts.

    Reads the module-level Selenium ``driver``. Returns a dict with keys
    'State', 'Jurisdiction', 'County', 'Facility' plus one key per label
    row in column 3 (typically 'All', 'Yes', 'No' -- "Yes" means ICE took
    custody of the detainee).
    """
    ice_dict = {}
    # Rows with class 'rowsel' are the ones currently clicked/selected.
    state_xpath = "//div[@id='col1']/table/tbody/tr[@class='rowsel']/td/a"
    jurisdiction_xpath = "//div[@id='col2']/table/tbody/tr[@class='rowsel']/td/a"
    ice_dict['State'] = driver.find_element_by_xpath(state_xpath).text
    ice_dict['Jurisdiction'] = driver.find_element_by_xpath(jurisdiction_xpath).text
    # Jurisdictions look like "County - Facility"; break them apart so
    # sheriffs can be rolled up by county. A jurisdiction without the
    # separator stands for both county and facility.
    if ' - ' in ice_dict['Jurisdiction']:
        parts = ice_dict['Jurisdiction'].split(' - ')
        ice_dict['County'] = parts[0]
        ice_dict['Facility'] = parts[1]
    else:
        ice_dict['County'] = ice_dict['Jurisdiction']
        ice_dict['Facility'] = ice_dict['Jurisdiction']
    # Column 3 is a label/number table whose cell count varies (sometimes 4,
    # sometimes 6 cells), so derive the row count from the cells found.
    # BUG FIX: use floor division -- plain '/' yields a float on Python 3
    # and range(float) raises TypeError.
    n_rows = len(driver.find_elements_by_xpath('//*[@id="col3"]/table/tbody/tr/td')) // 2
    for row in range(1, n_rows + 1):
        label_xpath = '//*[@id="col3"]/table/tbody/tr[{}]/td[1]'.format(row)
        num_xpath = '//*[@id="col3"]/table/tbody/tr[{}]/td[2]'.format(row)
        label = driver.find_element_by_xpath(label_xpath).text
        num = driver.find_element_by_xpath(num_xpath).text
        ice_dict[label] = num
    return ice_dict
# Gets the scraped info calling prior function for each jurisdiction, and returns df
# Gets the scraped info calling prior function for each jurisdiction, and returns df
def get_jurisdictions():
    """Click every jurisdiction link in column 2 and scrape each one.

    Reads the module-level Selenium ``driver`` and calls scrape_details()
    per jurisdiction. Returns a DataFrame with one row per jurisdiction;
    prints a success/failure tally so one broken row does not abort the run.
    """
    df = pd.DataFrame(columns=['State', 'Jurisdiction', 'All', 'Yes', 'No', 'County', 'Facility'])
    success_counter = 0
    failure_counter = 0
    sleep(5)  # let the state click settle before walking its jurisdictions
    for link in driver.find_elements_by_xpath("//div[@id='col2']/table/tbody/tr/td/a"):
        link.click()
        sleep(.5)  # just to make sure the detail pane gets a chance to load
        try:
            details = scrape_details()
            # Route the dict through a 1-row frame so column order matches df.
            df.loc[len(df)] = pd.DataFrame(details, index=[0]).loc[0]
            success_counter += 1
        except Exception:
            # BUG FIX: was a bare "except:", which also swallowed
            # KeyboardInterrupt/SystemExit and made the run un-killable.
            print(link.text + " failed")
            failure_counter += 1
    print("{} succeeded".format(success_counter))
    print("{} failed".format(failure_counter))
    print("----------------")
    return df
# Row 1 of column 1 is the "All" aggregate row -- we want to exclude it and
# iterate only the per-state rows below it.
all_jur_xpath = '//*[@id="col1"]/table/tbody/tr[1]/td/a'  # want to exclude this
all_state_xpath_template = '//*[@id="col1"]/table/tbody/tr[{}]/td/a'

# Run the script: click each state, scrape all of its jurisdictions, save a
# per-state CSV, and accumulate everything into one national DataFrame.
full_nation_df = pd.DataFrame(columns=['State', 'Jurisdiction', 'All', 'Yes', 'No', 'County', 'Facility'])
for state in range(2, 58):  # starts at 2 because element #1 is "all"
    state_link = driver.find_element_by_xpath(all_state_xpath_template.format(state))
    state_name = state_link.text
    print(state_name)
    state_link.click()
    state_df = get_jurisdictions()
    # Save each state as we go so a mid-run crash does not lose everything.
    state_df.to_csv('./state_files/{}_ice_detainees.csv'.format(state_name), index=False)
    # BUG FIX: DataFrame.append was deprecated in pandas 1.4 and removed in
    # 2.0; pd.concat with ignore_index=True is the equivalent replacement.
    full_nation_df = pd.concat([full_nation_df, state_df], ignore_index=True)
#format datetime
def get_year_date():
    """Return today's date as a 'DD-Mon-YYYY' string (e.g. '07-Feb-2026').

    BUG FIX: the original called time.strftime(), but the file only does
    'from time import sleep', so the bare name 'time' was undefined and this
    raised NameError. Use the datetime module, which the file does import.
    Field order matches the original: zero-padded day, abbreviated month
    name, four-digit year.
    """
    return datetime.datetime.now().strftime("%d-%b-%Y")
# Stamp the national CSV with today's date, then release the browser.
datestring = get_year_date()
full_nation_df.to_csv('./data/ice_detainees_{}.csv'.format(datestring),index=False)
driver.quit()