#!/usr/bin/env python2
# -*- coding: utf-8 -*-
__title__ = "ScrapedIn - Tool to Scrape LinkedIn"
__author__ = 'Danny Chrastil'
__email__ = '[email protected]'
__description__ = "A recon tool that allows you to scrape profile search results from LinkedIn"
__disclaimer__ = "This tool violates the TOS of LinkedIn.com. For educational purposes only. Use at your own risk"
__version__ = '2.0'
import sys
import os
import math
import json
import argparse
import requests
import xlsxwriter
from bs4 import BeautifulSoup
import config

# Python 2 idiom: make UTF-8 the default encoding so non-ASCII profile
# fields do not raise UnicodeDecodeError when written out
reload(sys)
sys.setdefaultencoding('utf-8')
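
# config.py is expected to supply LinkedIn credentials; a minimal sketch of
# the assumed structure (field names taken from the config.linkedin lookups
# in linkedIn() below):
#
#   linkedin = {
#       'username': 'user@example.com',   # placeholder value
#       'password': 'changeme',           # placeholder value
#   }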
""" Setup Argument Parameters """
parser = argparse.ArgumentParser(description='Discovery LinkedIn')
parser.add_argument('-u', '--keywords', help='Keywords to search')
parser.add_argument('-o', '--output', help='Output file (do not include extentions)')
args = parser.parse_args()
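
# Example invocation (keyword and filename values are illustrative only):
#   python ScrapedIn.py -u "python developer" -o python_devs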
def linkedIn(proxies=None):
    """Log in with the credentials from config.py and return the
    authenticated session cookies."""
    s = requests.Session()
    # Fetch the login page to pick up the hidden CSRF form token
    html = s.get("https://www.linkedin.com/", proxies=proxies)
    soup = BeautifulSoup(html.text, "html.parser")
    csrf = soup.find(id="loginCsrfParam-login")['value']
    login_data = {
        'session_key': config.linkedin['username'],
        'session_password': config.linkedin['password'],
        'loginCsrfParam': csrf,
    }
    s.post("https://www.linkedin.com/uas/login-submit",
           data=login_data,
           proxies=proxies)
    return s.cookies
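
# The proxies argument takes a standard requests proxy mapping; e.g. to route
# traffic through a local intercepting proxy (address is hypothetical):
#   cookies = linkedIn(proxies={'https': 'http://127.0.0.1:8080'})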
def get_search(search):
    # Query the Voyager faceted-search endpoint; requests URL-encodes the
    # guides value (v->PEOPLE restricts results to people profiles)
    url = "https://www.linkedin.com/voyager/api/search/cluster"
    params = {
        'count': 40,
        'guides': 'List(v->PEOPLE,facetGeoRegion->ar:0)',
        'keywords': search,
        'origin': 'FACETED_SEARCH',
        'q': 'guided',
        'start': 0
    }
    # The Csrf-Token header must match the JSESSIONID cookie value;
    # X-RestLi-Protocol-Version is an HTTP header, not a cookie
    headers = {
        'Csrf-Token': 'ajax:7736867257193100830',
        'X-RestLi-Protocol-Version': '2.0.0'
    }
    cookies['JSESSIONID'] = 'ajax:7736867257193100830'
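    # Fetch the initial page to get the total result count; the effective URL
    # matches the original hand-built query, e.g.:
    #   .../search/cluster?count=40&guides=List(v-%3EPEOPLE,facetGeoRegion-%3Ear%3A0)&keywords=...&origin=FACETED_SEARCH&q=guided&start=0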
    r = requests.get(url, cookies=cookies, headers=headers, params=params)
    content = json.loads(r.text)
    data_total = content['paging']['total']
    # Pages needed at 40 results/page, rounding up to cover a partial last page
    pages = int(math.ceil(data_total / 40.0))
    if pages == 0:
        pages = 1
    print "[Info] %i Results Found" % data_total
    if data_total > 1000:
        # LinkedIn caps search results at 1000, i.e. 25 pages of 40
        pages = 25
        print "[Notice] LinkedIn only allows 1000 results. Refine keywords to capture all data"
    print "[Info] Fetching %i Pages" % pages
    print
    # Row position for the XLSX worksheets
    recordpos = 1
    for p in range(pages):
        # Request each results page; 'start' is a record offset, not a page index
        params['start'] = p * 40
        r = requests.get(url, cookies=cookies, headers=headers, params=params)
        content = json.loads(r.text)
        print "[Info] Fetching page %i with %i results" % (p+1, len(content['elements'][0]['elements']))
        for c in content['elements'][0]['elements']:
            try:
                profile = c['hitInfo']['com.linkedin.voyager.search.SearchProfile']
                if profile['headless'] == False:
                    try:
                        data_industry = profile['industry']
                    except KeyError:
                        data_industry = ""
                    data_firstname = profile['miniProfile']['firstName']
                    data_lastname = profile['miniProfile']['lastName']
                    data_slug = "https://www.linkedin.com/in/%s" % profile['miniProfile']['publicIdentifier']
                    data_occupation = profile['miniProfile']['occupation']
                    data_location = profile['location']
                    try:
                        data_picture = "https://media.licdn.com/mpr/mpr/shrinknp_400_400%s" % profile['miniProfile']['picture']['com.linkedin.voyager.common.MediaProcessorImage']['id']
                    except KeyError:
                        data_picture = ""
                    # Write the raw fields to the 'dataset' sheet
                    worksheet1.write('A%i' % recordpos, data_firstname)
                    worksheet1.write('B%i' % recordpos, data_lastname)
                    worksheet1.write('C%i' % recordpos, data_occupation)
                    worksheet1.write('D%i' % recordpos, data_location)
                    worksheet1.write('E%i' % recordpos, data_industry)
                    worksheet1.write('F%i' % recordpos, data_slug)
                    worksheet1.write('G%i' % recordpos, data_picture)
                    # Build the 'report' sheet from formulas over the dataset
                    # sheet (=IMAGE is evaluated by Google Sheets, not classic Excel)
                    worksheet2.write('A%i' % recordpos, '=IMAGE(dataset!G%i)' % recordpos)
                    worksheet2.write('B%i' % recordpos, '=dataset!A%i&" "&dataset!B%i&"\n"&dataset!C%i&"\n"&dataset!D%i&"\n"&dataset!E%i' % (recordpos, recordpos, recordpos, recordpos, recordpos))
                    worksheet2.write('C%i' % recordpos, '=HYPERLINK(dataset!F%i)' % recordpos)
                    worksheet2.set_row(recordpos-1, 125)
                    # Increment record position
                    recordpos = recordpos + 1
                else:
                    print "[Notice] Headless profile found. Skipping"
            except KeyError:
                print "[Notice] Skipping malformed result"
                continue
    print
def authenticate():
    try:
        cookies = linkedIn()
        print "[Info] Obtained new session: %s" % cookies['li_at']
        # Only the li_at session cookie is needed for the Voyager API requests
        li_cookie = dict(li_at=cookies['li_at'])
    except KeyError as k:
        print k
        sys.exit('[Fatal] li_at cookie value not found')
    except Exception as e:
        print e
        sys.exit("[Fatal] Could not authenticate to LinkedIn.")
    return li_cookie
if __name__ == '__main__':
    title = """
   __                                  _   _____
  / _\ ___ _ __ __ _ _ __   ___  __| | \_   \_ __
  \ \ / __| '__/ _` | '_ \ / _ \/ _` |  / /\/ '_ \
  _\ \ (__| |  | (_| | |_) |  __/ (_| /\/ /_ | | | |
  \__/\___|_|   \__,_| .__/ \___|\__,_\____/ |_| |_|
                     |_|
            tool to scrape linkedin v2.0
    """
    print title
    # Prompt for any parameters not supplied on the command line
    search = args.keywords if args.keywords is not None else raw_input("Enter search keywords (use quotes for more precise results)\n")
    outfile = args.output if args.output is not None else raw_input("Enter filename for output (exclude file extension)\n")
    print
    # No manual URL-encoding needed: requests encodes the keywords via params
    cookies = authenticate()
    # Initiate the XLSX file, creating the results/ directory if it is missing
    if not os.path.isdir('results'):
        os.makedirs('results')
    workbook = xlsxwriter.Workbook('results/%s.xlsx' % outfile)
    worksheet1 = workbook.add_worksheet('dataset')
    worksheet2 = workbook.add_worksheet('report')
    worksheet2.set_column(0, 0, 25)   # column A: profile picture
    worksheet2.set_column(1, 2, 75)   # columns B-C: summary text, profile link
    # Initialize scraping
    get_search(search)
    # Close the XLSX file
    workbook.close()