This repository was archived by the owner on Mar 25, 2024. It is now read-only.

Commit 10176a4
Committed Nov 21, 2021

Reformat all code
Adapt scraping to follow google maps

1 parent 0caaec2 · commit 10176a4

13 files changed: +802 −600 lines changed

‎.gitignore

+2 −1 lines

@@ -2,4 +2,5 @@
 emails.txt
 geckodriver.log
 lib/__pycache__/
-venv
+venv
+*test*

‎assets/banner.txt

−1 line

@@ -5,4 +5,3 @@
 ██║╚██╔╝██║██╔══██║██║██║ ██╔══╝ ██║ ██║██║ ██║██║ ██╔══╝
 ██║ ╚═╝ ██║██║ ██║██║███████╗██║ ╚██████╔╝╚██████╔╝███████╗███████╗
 ╚═╝ ╚═╝╚═╝ ╚═╝╚═╝╚══════╝╚═╝ ╚═════╝ ╚═════╝ ╚══════╝╚══════╝
-

‎lib/digger.py

+29 lines (new file)

from lib.user import User
from lib.google_people_api import GooglePeopleApi
from time import sleep


class Digger:
    def __init__(self, mails, browser):
        self.users = None
        self._create_users(mails=mails, browser=browser)

    def _create_users(self, mails, browser):
        self.gpa = GooglePeopleApi(mails=mails)
        if not self.gpa.connected:
            self.users = [User(mail=mail, browser=browser) for mail in mails]
            return

        users_info = []
        while True:
            data = self.gpa.get_data()
            users_info.extend(data)

            if len(self.gpa.mails) == 0:
                break
            sleep(2)

        self.users = [User(browser=browser, **user_info) for user_info in users_info]

    def as_dict(self):
        return [user.as_dict() for user in self.users]
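
Not part of the commit, but for orientation: Digger is the new aggregation entry point that mailfogle.py (further down this diff) drives. A minimal usage sketch of that flow; the target address and output path are made-up placeholders:

import json

from lib.digger import Digger

mails = ["someone@example.com"]  # placeholder target list

# Resolves each address through the Google People API when credentials are
# available, then scrapes Maps and YouTube for every resolved user.
digger = Digger(mails, browser="firefox")

# as_dict() returns one dict per user, ready to serialize.
with open("./output.json", "w") as f:
    json.dump(digger.as_dict(), f, indent=2, ensure_ascii=False)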

‎lib/googlePeopleAPI.py

-48
This file was deleted.

‎lib/google_people_api.py

+110 lines (new file)

import os.path

from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build


class GooglePeopleApi:
    def __init__(self, mails):
        self.scopes = ["https://www.googleapis.com/auth/contacts"]
        self.creds = None
        self.service = None
        self.mails = mails
        try:
            self._connect()
            print("Connected to Google people API")
            self.connected = True
            self._importContacts()
        except:
            print("Cannot connect to Google people API")
            print("Retry after deleting 'token.json'")
            self.connected = False

    def _connect(self):
        # Check if 'token.json' exist or not
        if os.path.exists("token.json"):
            self.creds = Credentials.from_authorized_user_file(
                "token.json", self.scopes
            )

        # If there are no (valid) credentials available, let the user log in
        if not self.creds or not self.creds.valid:
            if not self.creds:
                flow = InstalledAppFlow.from_client_secrets_file(
                    "credentials.json", self.scopes
                )
                self.creds = flow.run_local_server(port=0)
            elif self.creds.expired and self.creds.refresh_token:
                self.creds.refresh(Request())

            # Save the credentials for the next run
            with open("token.json", "w") as token:
                token.write(self.creds.to_json())

        # Create service
        self.service = build("people", "v1", credentials=self.creds)

    def _importContacts(self):
        # Import the mail as a contact to the account
        for mail in self.mails:
            self.service.people().createContact(
                body={"emailAddresses": [{"value": mail}]}
            ).execute()

    def _downloadContacts(self):
        results = (
            self.service.people()
            .connections()
            .list(
                pageSize=1000,
                resourceName="people/me",
                personFields="names,photos,emailAddresses,metadata",
            )
            .execute()
        )
        return results.get("connections", [])

    def _deleteContact(self, name):
        # Sometimes the google API has trouble deleting the contact
        try:
            self.service.people().deleteContact(resourceName=name).execute()
        # Start again until it succeeds
        except:
            self._deleteContact(name)

    def get_data(self):
        connections = self._downloadContacts()
        connections = list(
            filter(
                lambda contact: "emailAddresses" in contact.keys()
                and contact["emailAddresses"][0]["value"] in self.mails,
                connections,
            )
        )

        users_data = []
        for person in connections:
            user = {}
            mail = person["emailAddresses"][0]["value"]

            if mail not in self.mails:
                continue

            user["mail"] = mail

            if len(person["metadata"]["sources"]) > 1:
                sources = person["metadata"]["sources"][1]
                user["user_type"] = sources["profileMetadata"]["userTypes"][0].replace(
                    "_", " "
                )
                user["google_ID"] = sources["id"]
                user["profile_pic"] = person["photos"][0]["url"]

            self._deleteContact(person["resourceName"])
            self.mails.remove(user["mail"])

            users_data.append(user)

        return users_data
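
For context, the lookup trick here is to temporarily add each target address as a contact, poll the People API until every contact resolves, then delete the contacts again. A rough standalone sketch of that loop, assuming a valid credentials.json OAuth client sits in the working directory (the address is a placeholder):

from time import sleep

from lib.google_people_api import GooglePeopleApi

gpa = GooglePeopleApi(mails=["someone@example.com"])

users_info = []
if gpa.connected:
    # get_data() removes every resolved address from gpa.mails,
    # so keep polling until the list is empty.
    while gpa.mails:
        users_info.extend(gpa.get_data())
        sleep(2)

# Each entry carries mail, user_type, google_ID and profile_pic when available.
print(users_info)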

‎lib/maps.py

-272
This file was deleted.

‎lib/maps_scraper.py

+390 lines (new file)

import requests

from bs4 import BeautifulSoup
from lib.selenium_wrapper import SeleniumWrapper
from time import sleep


class MapsScraper:

    # Global variable of the seconds to wait to be sure that content is loaded
    DELAY = 5
    # Set cookie for Google consent and "User Agent"
    CONSENT = "YES+cb.20210622-13-p0.fr+F+528"
    USER_AGENT = (
        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0"
    )

    def __init__(self, google_ID, browser):
        self.browser = browser

        self.exist = None
        self.is_public = None

        self.name = None
        self.local_guide = None
        self.contributions = []
        self.reviews = []
        self.medias = {}

        if google_ID:
            self.url = f"https://www.google.com/maps/contrib/{google_ID}"
            self._scrap_data()

    def as_dict(self):
        if not self.exist:
            return None

        data = {"url": self.url}
        if self.local_guide:
            data["local_guide"] = self.local_guide
        if self.contributions:
            data["contributions"] = self.contributions
        if self.reviews:
            data["reviews"] = self.reviews
        if self.medias:
            data["medias"] = self.medias
        return data

    def _scrap_data(self):

        self._request()

        if self.is_public:
            self.driver = SeleniumWrapper(self.browser).driver
            self._selenium_scrap()
            self.driver.quit()

    def _request(self):
        # Setting up the request
        session = requests.Session()
        session.headers.update({"User-Agent": self.USER_AGENT})
        consent_cookie = requests.cookies.create_cookie(
            domain=".google.com", name="CONSENT", value=self.CONSENT
        )
        session.cookies.set_cookie(consent_cookie)

        # Making the request
        maps_request = session.get(self.url)
        if maps_request.status_code == 404:
            self.exist = False
        elif maps_request.status_code == 200:
            self.exist = True
            self._minimal_scrap(BeautifulSoup(maps_request.text, "html.parser"))

    def _minimal_scrap(self, html):
        title = html.find("meta", attrs={"property": "og:title"})["content"].split(
            "by "
        )
        if len(title) != 1:
            self.is_public = True
            self.name = title[1]
        else:
            self.is_public = False
            return

        description = html.find("meta", attrs={"property": "og:description"})[
            "content"
        ].split(" Local Guide | ")
        if len(description) != 1:
            self.local_guide = {
                "level": int(description[0].replace("Level ", "")),
                "points": int(description[1].replace(" Points", "").replace(",", "")),
            }
        else:
            self.contributions = int(
                description[0]
                .replace(" Contributions", "")
                .replace(" Contribution", "")
            )

    def selenium_scroll(self, here):
        # Define end of scrolling
        element = here.find_elements_by_xpath(
            "//div[@data-review-id] | //div[@data-photo-bucket-id]"
        )
        if not element:
            return

        element = element[-1]
        while True:
            # Scroll
            element.location_once_scrolled_into_view
            # Be sure to load the page
            sleep(3)
            # Find last <div> of the section
            nextElement = here.find_elements_by_xpath(
                "//div[@data-review-id] | //div[@data-photo-bucket-id]"
            )[-1]
            # Do it again if not at the end, else break the loop
            if nextElement == element:
                return
            element = nextElement

    def _selenium_scrap(self):

        # Open URL with Selenium
        self.driver.get(self.url)

        # Automate accepting cookies
        cookies_button = self.driver.find_elements_by_xpath(
            "//form[@action='https://consent.google.com/s']//button"
        )[0]
        cookies_button.click()

        # Be sure to load the page
        sleep(self.DELAY)

        # Open contributions panel
        contributions = self.driver.find_elements_by_xpath(
            "//span[@jsaction='pane.profile-stats.showStats;keydown:pane.profile-stats.showStats']"
        )[0]
        contributions.click()

        # Be sure to load the page
        sleep(self.DELAY)

        # Get informations from the contribution panel
        contributions_content = self.driver.find_elements_by_xpath(
            "//div[@id='modal-dialog']//h1/../../div"
        )
        contributions_header = contributions_content[0]
        contributions_points = contributions_content[2]
        contributions_stats = contributions_content[3].text.split("\n")[1::2]

        # Scrap 'Level' and 'Points' if target is a 'Local Guide'
        if contributions_points.text:
            self.local_guide = {}
            self.local_guide["level"] = int(contributions_header.text.split()[-1])
            self.local_guide["points"] = int(
                contributions_points.text.replace("\u202f", "").split("\n")[0]
            )

        # Add all the differents contributions statistics to a list
        self.contributions = {}
        self.contributions["reviews"] = int(contributions_stats[0])
        self.contributions["ratings"] = int(contributions_stats[1])
        self.contributions["photos"] = int(contributions_stats[2])
        self.contributions["videos"] = int(contributions_stats[3])
        self.contributions["answers"] = int(contributions_stats[4])
        self.contributions["edits"] = int(contributions_stats[5])
        self.contributions["placesAdded"] = int(contributions_stats[6])
        self.contributions["roadsAdded"] = int(contributions_stats[7])
        self.contributions["factsChecked"] = int(contributions_stats[8])
        self.contributions["q&a"] = int(contributions_stats[9])
        self.contributions["publishedLists"] = int(contributions_stats[10])

        # Close contributions panel
        self.driver.find_elements_by_xpath(
            "//div[@id='modal-dialog']//button[@jsaction='modal.close']"
        )[0].click()

        # Be sure to load the page
        sleep(self.DELAY)

        # Checking if there are some ratings or reviews to scrap
        if self.contributions["reviews"] or self.contributions["ratings"]:

            # Click on the review's panel
            review_panel = self.driver.find_elements_by_xpath(
                "//div[@role='tablist']/button[1]"
            )[0]
            review_panel.click()

            # Be sure to load the page
            sleep(self.DELAY)

            # Scroll in the layout section to load all the reviews to scrap
            divs = self.driver.find_elements_by_xpath("//div")
            layout_section = [
                scrollbox_section
                for scrollbox_section in divs
                if "section-scrollbox" in scrollbox_section.get_attribute("class")
            ][0]
            self.selenium_scroll(layout_section)

            # Scrap each review
            self.reviews = []
            for mpReview in layout_section.find_elements_by_xpath(
                "//div[@role='button']/div[@data-review-id]"
            ):
                review = {}
                # Separate title from content
                title = mpReview.find_elements_by_xpath("div[@class]/div[@class]")[
                    0
                ].text.split("\n")
                content = mpReview.find_elements_by_xpath("div[@class]/div[@class]")[1]

                # Click on the 'Plus' button to load all the text
                plus_button = content.find_elements_by_xpath("//jsl/button")
                if plus_button:
                    plus_button[0].click()

                # From title
                review["place"] = title[0]
                if len(title) > 1:
                    review["address"] = title[1]

                # From content
                firstLine = content.find_elements_by_xpath("./div")[0]

                # Elements always in content
                review["stars"] = int(
                    firstLine.find_elements_by_xpath("./span[@class]")[0]
                    .get_attribute("aria-label")
                    .split("\xa0")[0]
                    .replace(" ", "")
                )
                review["when"] = firstLine.find_elements_by_xpath("./span[@class]")[
                    1
                ].text

                # Elements not there every time
                try:  # Comment of the target
                    nextLine = firstLine.find_elements_by_xpath("../div[@class]")[1]
                    if nextLine.text != "":
                        review["comment"] = nextLine.text
                except:
                    pass
                try:  # "Visited in..." or "Owner's Response"
                    nextLine = nextLine.find_elements_by_xpath("../div[@class]")[3]
                    # Case with "Like" & "Share" instead of "Visited in..."
                    if not nextLine.find_elements_by_xpath("./button"):
                        # Case with "Owner's response" instead of "Visited in..."
                        if "title" not in nextLine.find_elements_by_xpath("./span")[
                            0
                        ].get_attribute("class"):
                            review["visited"] = nextLine.text
                        else:
                            review["ownersResponse"] = nextLine.text
                except:
                    pass

                self.reviews.append(review)

        # Check if there are some media to scrap to
        if self.contributions["photos"] or self.contributions["videos"]:
            # Going back to photos panel
            medias_panel = self.driver.find_elements_by_xpath(
                "//div[@role='tablist']/button[2]"
            )[0]
            medias_panel.click()

            # Be sure to load the page
            sleep(self.DELAY)

            # Scroll in the layout section to load all the medias to scrap
            divs = self.driver.find_elements_by_xpath("//div")
            layout_section = [
                scrollbox_section
                for scrollbox_section in divs
                if "section-scrollbox" in scrollbox_section.get_attribute("class")
            ][0]
            self.selenium_scroll(layout_section)

            try:
                # Scrap the number of times the medias has been seen by people
                self.medias["views"] = int(
                    layout_section.find_elements_by_xpath("div")[0]
                    .text.replace("\u202f", "")
                    .split("\n")[0]
                    .split(" ")[1]
                )
                self.medias = {}
            except IndexError:
                # Medias are mentioned in contributions panel but none are scrapable
                return

            # Scrap each post with media
            self.medias["content"] = []
            for content in layout_section.find_elements_by_xpath(
                ".//div[@role='button']"
            ):
                media = {}
                media["medias"] = []

                # Get the place and the address of the post
                place_and_address = content.text.split("\n")

                # Add the place and the address
                media["place"] = place_and_address[0]
                try:  # When place is "Unknown place" but had medias posted on it
                    media["address"] = place_and_address[1]
                except:
                    pass

                # For each media in the post
                for med in content.find_elements_by_xpath(".//jsl"):

                    # If the media is picture, "play button" is not displayed
                    if (
                        med.find_elements_by_xpath("./div/div")[-1].get_attribute(
                            "style"
                        )
                        == "display: none;"
                    ):

                        img = None
                        while not img:  # Waiting the picture to be loaded
                            try:
                                img = med.find_elements_by_xpath(".//img")[
                                    0
                                ].get_attribute("src")
                            except:
                                pass

                        # Add its source to the array
                        media["medias"].append(img)

                    else:  # The media is a video

                        # Click on the thumbnail to load the video in a new iFrame
                        med.find_elements_by_xpath(".//img/..")[0].click()

                        # Be sure to load the iFrame
                        sleep(self.DELAY)

                        # Find the iFrame and switch to it
                        iframe = self.driver.find_elements_by_xpath(
                            "//iframe[@class='widget-scene-imagery-iframe']"
                        )[0]
                        self.driver.switch_to.frame(iframe)

                        vid = None
                        while not vid:  # Waiting the video to be loaded
                            try:
                                vid = self.driver.find_elements_by_xpath("//video")[
                                    0
                                ].get_attribute("src")
                            except:
                                pass

                        # Switch back to the default DOM
                        self.driver.switch_to.default_content()

                        # Add its source to the array
                        media["medias"].append(vid)

                self.medias["content"].append(media)

    @property
    def nb_contributions(self):
        return sum(self.contributions[what] for what in self.contributions)

    @property
    def nb_medias(self):
        if "content" not in self.medias:
            return 0
        return sum(len(c["medias"]) for c in self.medias["content"])

    @property
    def nb_reviews_ratings(self):
        return len(self.reviews)

    @property
    def nb_displayed_reviews_ratings(self):
        return self.contributions["reviews"] + self.contributions["ratings"]

    @property
    def nb_displayed_medias(self):
        return self.contributions["photos"] + self.contributions["videos"]
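
As a point of reference, the scraper can also be exercised on its own: the cheap requests/BeautifulSoup pass always runs, while the full Selenium pass (which opens a real browser) only runs when the Maps profile is public. A minimal sketch, with a made-up numeric account ID:

from lib.maps_scraper import MapsScraper

# A numeric Google account ID, e.g. as returned by GooglePeopleApi.get_data()
scraper = MapsScraper(google_ID="123456789012345678901", browser="firefox")

if scraper.exist and scraper.is_public:
    print(scraper.nb_contributions, "contributions")
    print(scraper.as_dict())  # url, local_guide, contributions, reviews, medias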

‎lib/seleniumWrapper.py

-30
This file was deleted.

‎lib/selenium_wrapper.py

+38 lines (new file)

from selenium import webdriver
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from sys import platform


class SeleniumWrapper:
    def __init__(self, browser):
        self.browser = browser
        self.driver = None
        DRIVER_PATH = ""

        # Detect OS
        if platform.startswith("linux"):
            EXT = ""
        elif platform.startswith("win32"):
            EXT = ".exe"
        else:
            raise Exception(
                "The use of selenium is not supported for this OS. "
                'Only "linux" and "win32" are possible\n',
                "Scrapping only the name and the number of contributions "
                "from Google Maps public profile",
                sep="\n",
            )

        # Choose the good driver
        if self.browser == "chrome":
            options = ChromeOptions()
            DRIVER_PATH = f"./drivers/chromedriver{EXT}"
            self.driver = webdriver.Chrome(options=options, executable_path=DRIVER_PATH)

        elif self.browser == "firefox":
            options = FirefoxOptions()
            DRIVER_PATH = f"./drivers/geckodriver{EXT}"
            self.driver = webdriver.Firefox(
                options=options, executable_path=DRIVER_PATH
            )
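
The wrapper only picks the matching driver binary for the current platform and browser; MapsScraper consumes it as below. A sketch, assuming the chromedriver/geckodriver binary sits in ./drivers/ as the paths above expect:

from lib.selenium_wrapper import SeleniumWrapper

driver = SeleniumWrapper("firefox").driver  # or "chrome"
driver.get("https://www.google.com/maps")
driver.quit()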

‎lib/user.py

+107 lines (new file)

from lib.maps_scraper import MapsScraper
from lib.youtube_scraper import YoutubeScraper


class User:
    def __init__(
        self, browser, mail=None, user_type=None, google_ID=None, profile_pic=None
    ):
        self.mail = mail
        self.user_type = user_type
        self.google_ID = google_ID
        self.profile_pic = profile_pic
        self.username = self.mail.split("@")[0]

        self.data_maps = MapsScraper(
            google_ID=google_ID,
            browser=browser,
        )

        self.data_youtube = YoutubeScraper(self.username)

        self.print_informations()

    def print_informations(self):
        self._print_global_info()
        self._print_maps_info()
        self._print_youtube_info()

    def _print_global_info(self):
        print(
            f"\n{self.mail} : "
            f"{self.user_type if self.user_type else 'NOT A GOOGLE USER'}\n"
        )
        if self.name:
            print(f"\tName : {self.name}")
        if self.google_ID:
            print(f"\tGoogle ID : {self.google_ID}")
        if self.profile_pic:
            print(f"\tProfile picture : {self.profile_pic}")

    def _print_maps_info(self):
        if not self.data_maps.exist:
            return

        print(f"\n\tMaps Contributions & Reviews ({self.data_maps.url})")

        if not self.data_maps.is_public:
            print("\tProfile is private, can't scrap informations from it")
            return

        if self.data_maps.local_guide:
            print(
                f"\t\tLocal Guide level {self.data_maps.local_guide['level']} with "
                f"{self.data_maps.local_guide['points']} points"
            )

        if self.data_maps.contributions:
            print(
                f"\t\t{self.data_maps.nb_contributions} contributions including "
                f"{self.data_maps.nb_displayed_reviews_ratings} reviews & ratings and "
                f"{self.data_maps.nb_displayed_medias} medias"
            )
            print(
                "\t\t\t"
                + " " * len(str(self.data_maps.nb_contributions))
                + f"scrapped in fact {self.data_maps.nb_reviews_ratings} "
                f"reviews & ratings and {self.data_maps.nb_medias} medias"
            )

    def _print_youtube_info(self):
        if not self.data_youtube.found:
            print(f'\n\tYouTube : User "{self.username}" not found')
            return

        print(f'\n\tYouTube : User "{self.data_youtube.username}" found !')
        creation = self.data_youtube.creation
        creation_date = creation[: len(creation) - 6].replace("T", " ")
        print(
            f'\t\tChannel named "{self.data_youtube.channel}" '
            f"created {creation_date}"
        )
        print(f"\t\t{self.data_youtube.url}")
        print(
            f"\t\t{sum(video['views'] for video in self.data_youtube.videos)} "
            f"cumulative views on {len(self.data_youtube.videos)} "
            "last posted video(s)"
        )

    def as_dict(self):
        data = {"mail": self.mail}
        if self.name:
            data["name"] = self.name
        if self.user_type:
            data["user_type"] = self.user_type
        if self.google_ID:
            data["google_ID"] = self.google_ID
        if self.profile_pic:
            data["profile_pic"] = self.profile_pic
        if self.data_maps:
            data["maps"] = self.data_maps.as_dict()
        if self.data_youtube:
            data["youtube"] = self.data_youtube.as_dict()
        return data

    @property
    def name(self):
        return self.data_maps.name
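
A User can also be built directly from the fields GooglePeopleApi.get_data() returns; the console summary is printed as a side effect of the constructor. A rough sketch, every value below being a placeholder:

from lib.user import User

user = User(
    browser="firefox",
    mail="someone@example.com",
    user_type="GOOGLE USER",
    google_ID="123456789012345678901",
    profile_pic="https://lh3.googleusercontent.com/a/placeholder",
)
print(user.as_dict())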

‎lib/youtube.py

-54
This file was deleted.

‎lib/youtube_scraper.py

+88 lines (new file)

import requests

from bs4 import BeautifulSoup


class YoutubeScraper:
    def __init__(self, username):
        self.username = username
        self.url = f"https://www.youtube.com/feeds/videos.xml?user={self.username}"

        self.found = None

        self.channel = None
        self.creation = None
        self.videos = None
        self._scrap_data()

    def _scrap_data(self):
        # Making the request
        youtube_request = requests.get(self.url)

        if youtube_request.status_code == 404:  # Not found
            self.found = False

        elif youtube_request.status_code == 200:  # Found

            html = BeautifulSoup(youtube_request.text, "html.parser")

            # Get informations of the account
            self.channel = html.title.string
            self.url = html.title.find_next_sibling("link").get("href")
            self.creation = html.published.string

            # Get informations of each video
            videos = []
            for vid in html.find_all("entry"):
                video = {}

                video["title"] = vid.find("title").string
                video["link"] = vid.find("link").get("href")
                video["thumbnail"] = (
                    vid.find("media:group").find("media:thumbnail").get("url")
                )
                video["description"] = (
                    vid.find("media:group").find("media:description").string
                )
                video["published"] = vid.find("published").string
                video["updated"] = vid.find("updated").string
                video["views"] = int(
                    vid.find("media:group")
                    .find("media:community")
                    .find("media:statistics")
                    .get("views")
                )
                video["thumbUp"] = int(
                    vid.find("media:group")
                    .find("media:community")
                    .find("media:starrating")
                    .get("count")
                )

                # YouTube give a note based on a ratio of thumbs up and down ('star')
                if video["thumbUp"] != "0":
                    video["stars"] = float(
                        vid.find("media:group")
                        .find("media:community")
                        .find("media:starrating")
                        .get("average")
                    )

                videos.append(video)

            self.videos = videos

    def as_dict(self):
        data = {"username": self.username}

        if self.found:
            data["url"] = self.url
        if self.channel:
            data["channel"] = self.channel
        if self.url:
            data["url"] = self.url
        if self.creation:
            data["creation"] = self.creation
        if self.videos:
            data["videos"] = self.videos
        return data
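
The YouTube lookup is just a request to the public RSS feed for a legacy username (no API key involved), so it works standalone; note the username is only the local part of the target's address and may well belong to an unrelated channel. A short sketch with a placeholder username:

from lib.youtube_scraper import YoutubeScraper

yt = YoutubeScraper("someone")  # local part of someone@example.com

if yt.found:
    print(yt.channel, yt.creation)
    for video in yt.videos:
        print(video["title"], video["views"])
else:
    print("No public feed found for this username")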

‎mailfogle.py

+38 −194 lines

@@ -1,237 +1,81 @@
-import json
 import argparse
-from sys import exit
-from time import sleep
-from lib.maps import mapsData
-from lib.youtube import youtubeData
-import lib.googlePeopleAPI as gpa
-
-def printBanner():
-    for line in open("assets/banner.txt","r"):
-        print(line.replace("\n",""))
-
-def printInformations(datas: dict):
-    """Print the main informations."""
-
-    user_type = (
-        ['NOT A GOOGLE USER'] if 'userTypes' not in datas
-        else [ut.replace('_',' ') for ut in datas['userTypes']]
-    )
-    print(f"\n{datas['mail']} : {', '.join(user_type)}\n")
-
-    # GOOGLE USER
-    if "userTypes" in datas:
-        if "name" in datas: print(f"\tName : {datas['name']}")
-        print(
-            f"\tGoogle ID : {datas['googleID']}" +
-            f"\n\tProfile picture : {datas['profilePic']}"
-        )
-
-        # Maps profile not private
-        if "maps" in datas:
-            print(f"\n\tMaps Contributions & Reviews ({datas['maps']['url']})")
-
-            if "localGuide" in datas['maps']:
-                level = datas['maps']['localGuide']['level']
-                points = datas['maps']['localGuide']['points']
-                print(f"\t\tLocal Guide level {level} with {points} points")
-
-            if isinstance(datas['maps']['contributions'],dict):
-                nbContrib = sum(
-                    datas['maps']['contributions'][what]
-                    for what in datas['maps']['contributions']
-                )
-                reviews_ratings = (
-                    datas['maps']['contributions']['reviews'] +
-                    datas['maps']['contributions']['ratings']
-                )
-                medias = (
-                    datas['maps']['contributions']['photos'] +
-                    datas['maps']['contributions']['videos']
-                )
-                print(
-                    f"\t\t{nbContrib} contributions including " +
-                    f"{reviews_ratings} reviews & ratings and {medias} medias"
-                )
-
-                count = 0
-                if datas['maps']['contributions']['photos'] or datas['maps']['contributions']['videos']:
-                    count = sum(
-                        len(c['medias'])
-                        for c in datas['maps']['medias']['content']
-                    )
-                reviews_ratings = (
-                    len(datas['maps']['reviews'])
-                    if 'reviews' in datas['maps'] else 0
-                )
-                print(
-                    "\t\t\t" + " "*len(str(nbContrib)) +
-                    f"scrapped in fact {reviews_ratings} reviews & ratings " +
-                    f"and {count} medias"
-                )
-
-            else:
-                print(
-                    f"\t\t{datas['maps']['contributions']} contributions" +
-                    "/!\\ This data is sometimes wrong. " +
-                    "Configure Selenium to scrap more accurate informations /!\\"
-                )
-
-        else:
-            print(
-                "\n\tGoogle maps profile is private, " +
-                "can\'t scrap informations from it"
-            )
-
-    # YouTube informations
-    if "youtube" in datas:
-        print(
-            f"\tYouTube : User \"{datas['youtube']['username']}\" found " +
-            "/!\\ Maybe not the one you're looking for /!\\"
-        )
-        creation = datas['youtube']['creation']
-        creation_date = creation[:len(creation)-6].replace('T',' ')
-        print(
-            f"\t\tChannel \"{datas['youtube']['channel']}\" created {creation_date}"
-        )
-        print(f"\t\t{datas['youtube']['url']}")
-        print(
-            f"\t\t{sum(video['views'] for video in datas['youtube']['videos'])} " +
-            f"cumulative views on {len(datas['youtube']['videos'])} " +
-            "last posted video(s) found"
-        )
-
-def main(mails,output,browser):
-
-    apiFlag = False
-    try:
-        gpa.connect()
-        apiFlag = True
-        print("Connected to Google people API")
-    except:
-        print("Cannot connect to Google people API")
-        print("Retry after deleting \"token.json\"")
-
-    datas = []
-
-    if apiFlag:
-
-        gpa.importContacts(mails)
-        while True:
-
-            connections = gpa.downloadContacts()
-            connections = list(filter(
-                lambda contact : "emailAddresses" in contact.keys()
-                and contact['emailAddresses'][0]['value'] in mails,
-                connections,
-            ))
-
-            for person in connections:
-                data = {}
-                mail = person['emailAddresses'][0]['value']
-
-                if mail in mails:
-                    data['mail'] = mail
-                    if len(person['metadata']['sources']) > 1:
-                        sources = person['metadata']['sources'][1]
-                        data['userTypes'] = sources['profileMetadata']['userTypes']
-                        data['googleID'] = sources['id']
-                        data['profilePic'] = person['photos'][0]['url']
-
-                        mpDatas = mapsData(
-                            url=(
-                                "https://www.google.com/maps/contrib/" +
-                                data['googleID']
-                            ),
-                            browser=browser,
-                        )
-                        if mpDatas: # If profile is public
-                            data['maps'] = mpDatas
-                            data['name'] = data['maps']['name']
-                            data['maps'].pop("name")
-
-                    ytDatas = youtubeData(mail.split("@")[0])
-                    if ytDatas: data['youtube'] = ytDatas
+import json
 
-                    printInformations(data)
+from lib.digger import Digger
 
-                    gpa.deleteContact(person['resourceName'])
-                    mails.pop(mails.index(mail))
 
-                    datas.append(data)
+def main(mails, output, browser):
 
-            if len(mails) == 0: break
-            sleep(2)
+    data = Digger(mails, browser)
 
-    else:
-        for mail in mails:
-            ytDatas = youtubeData(mail.split("@")[0])
-            data = {"mail" : mail}
-            if ytDatas :
-                data["youtube"] = ytDatas
-            printInformations(data)
-            datas.append(ytDatas)
+    with open((f"./{output}.json"), "w") as f:
+        json.dump(data.as_dict(), f, indent=2, ensure_ascii=False)
 
-    with open((f"./{output}.json"),"w") as f:
-        json.dump(datas,f, indent=2)
 
 if __name__ == "__main__":
-
     parser = argparse.ArgumentParser(
-        description="Explore and scrap user\'s public data from Google account"
+        description="Explore and scrap user's public data from Google account",
    )
     parser.add_argument(
         "-e",
-        # metavar="EMAIL",
+        "--email",
         dest="email",
         type=str,
         nargs="?",
         default=None,
-        help="target\'s mail"
+        help="target's mail",
     )
     parser.add_argument(
         "-f",
+        "--file",
         dest="file",
         type=str,
         nargs="?",
         default=None,
-        help="path to a file listing the email addresses of the targets"
+        help="path to a file listing the email addresses of the targets",
     )
     parser.add_argument(
         "-o",
+        "--output",
         dest="output",
         type=str,
         nargs="?",
+        required=False,
         default="output",
-        help="choose output name (default is \"output\")",
+        help="choose output name (default is 'output')",
     )
     parser.add_argument(
         "-b",
+        "--browser",
         dest="browser",
-        choices=["firefox","chrome"],
+        type=str.lower,
+        choices=["firefox", "chrome"],
+        required=False,
         default="firefox",
-        help="select browser \"chrome\" or \"firefox\" (default is \"firefox\")",
+        help='select browser "chrome" or "firefox" (default is "firefox")',
+    )
+    parser.add_argument(
+        "--no-banner",
+        dest="nobanner",
+        required=False,
+        default=False,
+        action="store_true",
+        help="doesn't display banner",
     )
     args = parser.parse_args()
 
-    printBanner()
+    if not args.nobanner:
+        print(open("assets/banner.txt", "r").read())
 
-    mails = []
-
-    if args.email: mails.append(args.email)
-    if args.file: mails.extend(open(args.file).read().splitlines())
+    if not (args.email or args.file):
+        parser.error("Please specify email(s) to dig")
 
-    if not mails:
-        exit(
-            "Please specify target\'s mail\n" +
-            "mailfogle.py [-h] for more informations"
-        )
+    mails = []
+    if args.email:
+        mails.append(args.email)
+    if args.file:
+        mails.extend(open(args.file).read().splitlines())
 
-    if args.browser.lower() not in ['firefox','chrome']:
-        exit(
-            "Please choose a browser between \"Firefox\" and \"Chrome\"\n" +
-            "mailfogle.py [-h] for more informations"
-        )
-    else: browser = args.browser.lower()
+    browser = args.browser
 
-    main(mails=mails,output=args.output,browser=browser)
+    main(mails=mails, output=args.output, browser=browser)
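
With the reworked CLI, a typical run would look something like the lines below (the address and file name are placeholders; -b defaults to firefox and -o to "output"):

python mailfogle.py -e someone@example.com
python mailfogle.py -f targets.txt -o results -b chrome --no-banner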
