Repository: cullenwatson/StaffSpy Branch: main Commit: 0a8a8d73a5db Files: 28 Total size: 107.0 KB Directory structure: gitextract_26g2vb8c/ ├── .github/ │ └── workflows/ │ └── publish-to-pypi.yml ├── .gitignore ├── .pre-commit-config.yaml ├── LICENSE ├── README.md ├── examples/ │ ├── daily_auto_connect.py │ ├── upload_staff_to_clay.py │ └── x_corp_staff.py ├── pyproject.toml └── staffspy/ ├── __init__.py ├── linkedin/ │ ├── certifications.py │ ├── comments.py │ ├── contact_info.py │ ├── employee.py │ ├── employee_bio.py │ ├── experiences.py │ ├── languages.py │ ├── linkedin.py │ ├── schools.py │ └── skills.py ├── solvers/ │ ├── capsolver.py │ ├── solver.py │ ├── solver_type.py │ └── two_captcha.py └── utils/ ├── driver_type.py ├── exceptions.py ├── models.py └── utils.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .github/workflows/publish-to-pypi.yml ================================================ name: Publish Python 🐍 distributions 📦 to PyPI on: push jobs: build-n-publish: name: Build and publish Python 🐍 distributions 📦 to PyPI runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 - name: Set up Python uses: actions/setup-python@v4 with: python-version: "3.10" - name: Install poetry run: >- python3 -m pip install poetry --user - name: Build distribution 📦 run: >- python3 -m poetry build - name: Publish distribution 📦 to PyPI if: startsWith(github.ref, 'refs/tags') uses: pypa/gh-action-pypi-publish@release/v1 with: password: ${{ secrets.PYPI_API_TOKEN }} ================================================ FILE: .gitignore ================================================ /venv/ /.idea **/__pycache__/ **/.pytest_cache/ /.ipynb_checkpoints/ **/output/ **/.DS_Store *.pyc .env dist ================================================ FILE: .pre-commit-config.yaml ================================================ repos: - repo: 
https://github.com/psf/black rev: 24.2.0 hooks: - id: black language_version: python args: [--line-length=88, --quiet] ================================================ FILE: LICENSE ================================================ DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE Version 2, December 2004 Copyright (C) 2004 Sam Hocevar Everyone is permitted to copy and distribute verbatim or modified copies of this license document, and changing it is allowed as long as the name is changed. DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 0. You just DO WHAT THE FUCK YOU WANT TO. ================================================ FILE: README.md ================================================ 3FAD4652-488F-4F6F-A744-4C2AA5855E92 **StaffSpy** is a staff fetcher library for LinkedIn. ## Features - Fetches staff from a company on **LinkedIn** - Obtains skills, experiences, certifications & more - Fetch individuals users / comments on posts - Export all your connections with their contact info - Aggregates the employees in a Pandas DataFrame ### Installation ``` pip install -U "staffspy[browser]" ``` Or for latest code from this repo directly ``` pip install "git+https://github.com/cullenwatson/StaffSpy.git#egg=staffspy[browser]" ``` _Python version >= [3.10](https://www.python.org/downloads/release/python-3100/) required_ ### Usage ```python from staffspy import LinkedInAccount, SolverType, DriverType, BrowserType account = LinkedInAccount( # driver_type=DriverType( # if issues with webdriver, specify its exact location, download link in the FAQ # browser_type=BrowserType.CHROME, # executable_path="/Users/pc/chromedriver-mac-arm64/chromedriver" # ), session_file="session.pkl", # save login cookies to only log in once (lasts a week or so) log_level=1, # 0 for no logs ) # search by company staff = account.scrape_staff( company_name="openai", search_term="software engineer", location="london", extra_profile_data=True, # 
fetch all past experiences, schools, & skills max_results=50, # can go up to 1000 # block=True # if you want to block the user after scraping, to exclude from future search results # connect=True # if you want to connect with the users until you hit your limit ) # or fetch by user ids users = account.scrape_users( user_ids=['williamhgates', 'rbranson', 'jeffweiner08'] # connect=True, # block=True ) # fetch all comments on two of Bill Gates' posts comments = account.scrape_comments( ['7252421958540091394','7253083989547048961'] ) # fetch company details companies = account.scrape_companies( company_names=['openai', 'microsoft'] ) # fetch connections (also gets their contact info if available) connections = account.scrape_connections( extra_profile_data=True, max_results=50 ) # export any of the results to csv staff.to_csv("staff.csv", index=False) ``` #### Browser login If you would rather use a browser to log in, install the browser add-on for StaffSpy. `pip install staffspy[browser]` If you do not pass the `username` & `password` params, then a browser will open to sign in to LinkedIn on the first sign-in. Press enter after signing in to begin scraping. 
### Output | profile_id | name | first_name | last_name | location | age | position | followers | connections | company | past_company1 | past_company2 | school1 | school2 | skill1 | skill2 | skill3 | is_connection | premium | creator | potential_email | profile_link | profile_photo | | ---------------- | -------------- | ---------- | --------- | ------------------------------- | --- | ------------------------------- | --------- | ----------- | ------- | ------------- | ------------- | ---------------------------------- | ------------------------- | -------- | ---------- | ---------- | ------------- | ------- | ------- | ------------------------------------------------ | -------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | javiersierra2102 | Javier Sierra | Javier | Sierra | London, England, United Kingdom | 39 | Software Engineer | 735 | 725 | OpenAI | Meta | Oculus VR | Hult International Business School | Universidad Simón Bolívar | Java | JavaScript | C++ | FALSE | FALSE | FALSE | javier.sierra@openai.com, jsierra@openai.com | https://www.linkedin.com/in/javiersierra2102 | https://media.licdn.com/dms/image/C4D03AQHEyUg1kGT08Q/profile-displayphoto-shrink_800_800/0/1516504680512?e=1727913600&v=beta&t=3enCmNDBtJ7LxfbW6j1hDD8qNtHjO2jb2XTONECxUXw | | dougli | Douglas Li | Douglas | Li | London, England, United Kingdom | 37 | @ OpenAI UK, previously at Meta | 583 | 401 | OpenAI | Shift Lab | Facebook | Washington University in St. 
Louis | | Java | Python | JavaScript | FALSE | TRUE | FALSE | douglas.li@openai.com, dli@openai.com | https://www.linkedin.com/in/dougli | https://media.licdn.com/dms/image/D4E03AQETmRyb3_GB8A/profile-displayphoto-shrink_800_800/0/1687996628597?e=1727913600&v=beta&t=HRYGJ4RxsTMcPF1YcSikXlbz99hx353csho3PWT6fOQ | | nkartashov | Nick Kartashov | Nick | Kartashov | London, England, United Kingdom | 33 | Software Engineer | 2186 | 2182 | OpenAI | Google | DeepMind | St. Petersburg Academic University | Bioinformatics Institute | Teamwork | Java | Haskell | FALSE | FALSE | FALSE | nick.kartashov@openai.com, nkartashov@openai.com | https://www.linkedin.com/in/nkartashov | https://media.licdn.com/dms/image/D4E03AQEjOKxC5UgwWw/profile-displayphoto-shrink_800_800/0/1680706122689?e=1727913600&v=beta&t=m-JnG9nm0zxp1Z7njnInwbCoXyqa3AN-vJZntLfbzQ4 | ### Parameters for `LinkedInAccount()` ```plaintext Optional ├── session_file (str): | file path to save session cookies, so only one manual login is needed. | can use mult profiles this way | | For automated login ├── username (str): | linkedin account email │ ├── password (str): | linkedin account password | ├── driver_type (DriverType): | signs in with the given BrowserType (Chrome, Firefox) and executable_path | ├── solver_service (SolverType): | solves the captcha using the desired service - either CapSolver, or 2Captcha (worse of the two) | ├── solver_api_key (str): | api key for the solver provider │ ├── log_level (int): | Controls the verbosity of the runtime printouts | (0 prints only errors, 1 is info, 2 is all logs. Default is 0.) ``` ### Parameters for `scrape_staff()` ```plaintext Optional ├── company_name (str): | company identifier on linkedin, will search for that company if that company id does not exist | e.g. openai from https://www.linkedin.com/company/openai | ├── search_term (str): | staff title to search for | e.g. software engineer | ├── location (str): | location the staff resides | e.g. 
london │ ├── extra_profile_data (bool) | fetches educations, experiences, skills, certifications (Default false) │ ├── max_results (int): | number of staff to fetch, default/max is 1000 for a search imposed by LinkedIn | ├── block (bool): | whether to block the user after scraping | ├── connect (bool): | whether to connect with the user after scraping ``` ### Parameters for `scrape_users()` ```plaintext ├── user_ids (list): | user ids to scrape from | e.g. dougmcmillon from https://www.linkedin.com/in/dougmcmillon | ├── block (bool): | whether to block the user after scraping | ├── connect (bool): | whether to connect with the user after scraping ``` ### Parameters for `scrape_comments()` ```plaintext ├── post_ids (list): | post ids to scrape from | e.g. 7252381444906364929 from https://www.linkedin.com/posts/williamhgates_technology-transformtheeveryday-activity-7252381444906364929-Bkls ``` ### Parameters for `scrape_companies()` ```plaintext ├── company_names (list): | list of company names to scrape details from | e.g. ['openai', 'microsoft', 'google'] ``` ### Parameters for `scrape_connections()` ```plaintext ├── max_results (int): | maximum number of connections to fetch | ├── extra_profile_data (bool): | fetches educations, experiences, skills, certifications & contact info for each connection (Default false) ``` ### LinkedIn notes - only 1000 max results per search - extra_profile_data increases runtime by O(n) - if rate limited, the program will stop scraping - if using non-browser sign in, turn off 2fa --- ## Frequently Asked Questions --- **Q: Can I get my account banned?** **A:** It is a possibility, although there are no recorded incidents. Let me know if you are the first. However, to protect you, the code does not allow you to run it if LinkedIn is blocking you --- **Q: Scraped 999 staff members, with 869 hidden LinkedIn Members?** **A:** It means your LinkedIn account is bad. 
Not sure how they classify it but unverified email, new account, low connections and a bunch of factors go into it. --- **Q: How to get around the 1000 search limit result?** **A:** Check the examples folder. We can block the user after searching and try many different locations and search terms to maximize results. --- **Q: Exception: driver not found for selenium?** **A:** You need chromedriver installed (not the chrome): https://googlechromelabs.github.io/chrome-for-testing/#stable --- **Q: Encountering issues with your queries?** **A:** If problems persist, [submit an issue](https://github.com/cullenwatson/StaffSpy/issues). ### Staff Schema ```plaintext Staff ├── Personal Information │ ├── search_term │ ├── id │ ├── name │ ├── first_name │ ├── last_name │ ├── location │ └── bio │ ├── Professional Details │ ├── position │ ├── profile_id │ ├── profile_link │ ├── potential_emails │ └── estimated_age │ ├── Social Connectivity │ ├── followers │ ├── connections │ └── mutuals_count │ ├── Status │ ├── influencer │ ├── creator │ ├── premium │ ├── open_to_work │ ├── is_hiring │ └── is_connection │ ├── Visuals │ ├── profile_photo │ └── banner_photo │ ├── Skills │ ├── name │ └── endorsements │ ├── Experiences │ ├── from_date │ ├── to_date │ ├── duration │ ├── title │ ├── company │ ├── location │ └── emp_type │ ├── Certifications │ ├── title │ ├── issuer │ ├── date_issued │ ├── cert_id │ └── cert_link │ ├── Educational Background | ├── years | ├── school | └── degree │ └── Connection Info (only when a connection and enabled on their profile) ├── email_address ├── address ├── birthday ├── websites ├── phone_numbers └── created_at ``` ================================================ FILE: examples/daily_auto_connect.py ================================================ """ Script to connect with 10 software engineers daily from random tech companies """ from staffspy import LinkedInAccount, DriverType, BrowserType import random import time from datetime import datetime import 
def connect_with_staff():
    """Run one connection pass against a randomly chosen tech company.

    Logs in through the saved ``session.pkl`` session, picks one entry from
    ``TECH_COMPANIES`` and calls ``scrape_staff`` with ``connect=True`` for up
    to 10 "software engineer" results.
    """
    started_at = datetime.now()
    print(f"Starting connection run at {started_at}")

    # Reuse the pickled session so the script does not log in on every run.
    linkedin = LinkedInAccount(log_level=1, session_file="session.pkl")

    target = random.choice(TECH_COMPANIES)
    print(f"Selected company: {target}")

    search_options = {
        "company_name": target,
        "search_term": "software engineer",
        "max_results": 10,
        "extra_profile_data": True,
        "connect": True,
    }
    linkedin.scrape_staff(**search_options)
def save_results(users: pd.DataFrame):
    """Write *users* to a timestamped CSV under ``output/<company_name>/``.

    Reads the module-level ``company_name`` and creates the output directory
    when it does not exist yet.
    """
    target_dir = f"output/{company_name}"
    os.makedirs(target_dir, exist_ok=True)

    # Second-resolution timestamp keeps files from successive searches apart.
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    users.to_csv(f"{target_dir}/users_{stamp}.csv", index=False)


def scrape_and_save(term=None, location=None):
    """Run one blocking staff search and persist any non-empty result.

    Args:
        term: optional search term forwarded as ``search_term``.
        location: optional location filter.
    """
    found = account.scrape_staff(
        block=True,
        max_results=1000,
        extra_profile_data=True,
        location=location,
        search_term=term,
        company_name=company_name,
    )
    if found.empty:
        return
    save_results(found)
departments for department in departments: scrape_and_save(term=department) # Search by locations for location in locations: scrape_and_save(location=location) # load all csvs into one df files = glob.glob("output/x-corp/*.csv") dfs = [pd.read_csv(f) for f in files] combined_df = pd.concat(dfs, ignore_index=True) # Filter out hidden profiles filtered_df = combined_df[combined_df["urn"] != "headless"] filtered_df = filtered_df[filtered_df["current_company"] == "X"] filtered_df = filtered_df.drop_duplicates(subset="id") filtered_urns = len(set(filtered_df["urn"])) print(f"Total unique profiles: {filtered_urns}") company_name = "x-corp" filtered_df.to_csv( f"output/{company_name}/final_result_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv", index=False, ) ================================================ FILE: pyproject.toml ================================================ [tool.poetry] name = "staffspy" version = "0.2.25" description = "Staff scraper library for LinkedIn" authors = ["Cullen Watson "] readme = "README.md" [tool.poetry.dependencies] python = "^3.10" pydantic = "^2.7.2" pandas = "^2.2.2" requests = "^2.32.3" tldextract = "^5.1.2" selenium = { version = "^4.3.0", optional = true } tenacity = "^8.5.0" python-dateutil = "^2.9.0.post0" beautifulsoup4 = "^4.12.3" 2captcha-python = "^1.2.8" [tool.poetry.extras] browser = ["selenium"] [tool.poetry.group.dev.dependencies] pre-commit = "^3.7.1" black = "^24.4.2" jupyter = "^1.0.0" [build-system] requires = ["poetry-core"] build-backend = "poetry.core.masonry.api" ================================================ FILE: staffspy/__init__.py ================================================ import json import pandas as pd from staffspy.linkedin.comments import CommentFetcher from staffspy.linkedin.linkedin import LinkedInScraper from staffspy.utils.models import Staff from staffspy.solvers.capsolver import CapSolver from staffspy.solvers.solver_type import SolverType from staffspy.solvers.two_captcha import 
class LinkedInAccount:
    """LinkedinAccount storing cookie data and providing outer facing methods for client.

    Once ``on_block`` is set, every ``scrape_*`` method logs the cooldown
    error and returns ``None`` instead of contacting LinkedIn.
    """

    solver_map = {
        SolverType.CAPSOLVER: CapSolver,
        SolverType.TWO_CAPTCHA: TwoCaptchaSolver,
    }

    # Logged whenever a scrape is attempted while ``on_block`` is set.
    _BLOCKED_MESSAGE = "Account is on cooldown as a safety precaution after receiving a 429 (TooManyRequests) from LinkedIn. Please recreate a new LinkedInAccount to proceed."

    def __init__(
        self,
        session_file: str | None = None,
        username: str | None = None,
        password: str | None = None,
        log_level: int = 0,
        solver_api_key: str | None = None,
        solver_service: SolverType = SolverType.CAPSOLVER,
        driver_type: DriverType | None = None,
    ):
        """Store the credentials/settings, build the captcha solver and log in.

        Args:
            session_file: path handed to ``Login`` for the saved session.
            username: LinkedIn account email.
            password: LinkedIn account password.
            log_level: verbosity passed to ``set_logger_level``.
            solver_api_key: API key handed to the selected solver class.
            solver_service: key into ``solver_map`` selecting the solver class.
            driver_type: browser driver settings handed to ``Login``.
        """
        self.session_file = session_file
        self.username = username
        self.password = password
        self.log_level = log_level
        self.solver = self.solver_map[solver_service](solver_api_key)
        self.driver_type = driver_type
        self.session = None
        self.linkedin_scraper = None
        self.on_block = False
        self.login()

    def login(self):
        """Apply the log level and load a session through ``Login``."""
        set_logger_level(self.log_level)
        login = Login(
            self.username,
            self.password,
            self.solver,
            self.session_file,
            self.driver_type,
        )
        self.session = login.load_session()

    def _is_blocked(self) -> bool:
        """Log the cooldown error when the account is on block; return the flag."""
        if self.on_block:
            logger.error(self._BLOCKED_MESSAGE)
        return self.on_block

    def _sync_block_state(self, li_scraper) -> None:
        """Copy a scraper's ``on_block`` flag onto this account (never clears it)."""
        # getattr: only scrape_staff is known (from this file) to leave the
        # attribute set, so tolerate scrapers that never assigned it.
        if getattr(li_scraper, "on_block", False):
            self.on_block = True

    def scrape_staff(
        self,
        company_name: str | None = None,
        search_term: str | None = None,
        location: str | None = None,
        extra_profile_data: bool = False,
        max_results: int = 1000,
        block: bool = False,
        connect: bool = False,
    ) -> pd.DataFrame | None:
        """Main function entry point to scrape LinkedIn staff.

        Returns:
            A DataFrame with visible profiles first and "LinkedIn Member"
            (hidden) rows last, an empty DataFrame when nothing was found, or
            ``None`` when the account is on cooldown.
        """
        if self._is_blocked():
            return None

        li_scraper = LinkedInScraper(self.session)
        staff = li_scraper.scrape_staff(
            company_name=company_name,
            extra_profile_data=extra_profile_data,
            search_term=search_term,
            location=location,
            max_results=max_results,
            block=block,
            connect=connect,
        )
        self._sync_block_state(li_scraper)

        staff_df = pd.DataFrame([member.to_dict() for member in staff])
        if staff_df.empty:
            return staff_df
        staff_df = clean_df(staff_df)

        linkedin_member_df = staff_df[staff_df["name"] == "LinkedIn Member"]
        non_linkedin_member_df = staff_df[staff_df["name"] != "LinkedIn Member"]
        staff_df = pd.concat([non_linkedin_member_df, linkedin_member_df])
        logger.info(
            f"3) Staff from {company_name}: {len(staff_df)} total, {len(linkedin_member_df)} hidden, {len(staff_df) - len(linkedin_member_df)} visible"
        )
        return staff_df.reset_index(drop=True)

    def scrape_users(
        self, user_ids: list[str], block: bool = False, connect: bool = False
    ) -> pd.DataFrame | None:
        """Scrape users from Linkedin by user IDs.

        Users whose internal id cannot be resolved are dropped from the
        result. ``block`` takes precedence over ``connect``.

        Returns:
            A DataFrame of resolved users (hidden members last), an empty
            DataFrame when none resolved, or ``None`` when on cooldown.
        """
        if self._is_blocked():
            return None

        li_scraper = LinkedInScraper(self.session)
        li_scraper.num_staff = len(user_ids)
        users = [
            Staff(
                id="",
                search_term="manual",
                profile_id=user_id,
                profile_link=f"https://www.linkedin.com/in/{user_id}",
            )
            for user_id in user_ids
        ]

        for i, user in enumerate(users, start=1):
            user.id, user.urn = li_scraper.fetch_user_profile_data_from_public_id(
                user.profile_id, "user_id"
            )
            if user.id:
                li_scraper.fetch_all_info_for_employee(user, i)
                if block:
                    li_scraper.block_user(user)
                elif connect:
                    li_scraper.connect_user(user)
        # Keep the account-level flag in step with scrape_staff's behaviour.
        self._sync_block_state(li_scraper)

        users_df = pd.DataFrame([user.to_dict() for user in users if user.id])
        if users_df.empty:
            return users_df

        linkedin_member_df = users_df[users_df["name"] == "LinkedIn Member"]
        non_linkedin_member_df = users_df[users_df["name"] != "LinkedIn Member"]
        users_df = pd.concat([non_linkedin_member_df, linkedin_member_df])
        logger.info(f"Scraped {len(users_df)} users")
        return users_df

    def scrape_comments(self, post_ids: list[str]) -> pd.DataFrame | None:
        """Scrape comments from Linkedin by post IDs.

        Returns:
            A DataFrame of comments sorted newest first with an extra
            ``emails`` column, an empty DataFrame when there are none, or
            ``None`` when on cooldown.
        """
        if self._is_blocked():
            return None

        comment_fetcher = CommentFetcher(self.session)
        all_comments = []
        for post_id in post_ids:
            comments = comment_fetcher.fetch_comments(post_id)
            # fetch_comments can signal failure with a non-list falsy value;
            # extending with that would raise TypeError.
            if comments:
                all_comments.extend(comments)

        comment_df = pd.DataFrame([comment.to_dict() for comment in all_comments])
        if not comment_df.empty:
            comment_df["emails"] = comment_df["text"].apply(extract_emails_from_text)
            comment_df = comment_df.sort_values(by="created_at", ascending=False)
        return comment_df

    def scrape_companies(
        self,
        company_names: list[str] | None = None,
    ) -> pd.DataFrame | None:
        """Scrape company details from Linkedin.

        Companies that fail to fetch or parse are logged and skipped.

        Raises:
            ValueError: if ``company_names`` is empty or ``None``.
        """
        if self._is_blocked():
            return None

        if not company_names:
            raise ValueError("company_names list cannot be empty")

        li_scraper = LinkedInScraper(self.session)
        company_dfs = []
        for company_name in company_names:
            try:
                company_res = li_scraper.fetch_or_search_company(company_name)
                try:
                    company_data = company_res.json()
                except json.decoder.JSONDecodeError:
                    logger.error(f"Failed to fetch company data for {company_name}")
                    continue
                company_df = parse_company_data(company_data, search_term=company_name)
                company_dfs.append(company_df)
            except Exception as e:
                # Best-effort batch: one bad company must not abort the rest.
                logger.error(f"Failed to process company {company_name}: {str(e)}")
                continue

        if not company_dfs:
            return pd.DataFrame()
        return pd.concat(company_dfs, ignore_index=True)

    def scrape_connections(
        self,
        max_results: int = 10**8,
        extra_profile_data: bool = False,
    ) -> pd.DataFrame | None:
        """Scrape connections from Linkedin.

        Returns:
            A cleaned DataFrame of connections, an empty DataFrame when there
            are none, or ``None`` when on cooldown.
        """
        if self._is_blocked():
            return None

        li_scraper = LinkedInScraper(self.session)
        connections = li_scraper.scrape_connections(
            max_results=max_results,
            extra_profile_data=extra_profile_data,
        )
        self._sync_block_state(li_scraper)

        connections_df = pd.DataFrame()
        if connections:
            connections_df = pd.DataFrame(
                [connection.to_dict() for connection in connections]
            )
            connections_df = clean_df(connections_df)
        return connections_df
class CommentFetcher:
    """Fetch and parse the comments of a LinkedIn post through the Voyager GraphQL endpoint."""

    def __init__(self, session):
        """Keep the HTTP session and prepare the paginated endpoint template."""
        self.session = session
        self.endpoint = "https://www.linkedin.com/voyager/api/graphql?queryId=voyagerSocialDashComments.8cb29aedde780600a7ad17fc7ebb8277&queryName=SocialDashCommentsBySocialDetail&variables=(origins:List(),count:100,socialDetailUrn:urn%3Ali%3Afsd_socialDetail%3A%28urn%3Ali%3Aactivity%3A{post_id}%2Curn%3Ali%3Aactivity%3A7254884361622208512%2Curn%3Ali%3AhighlightedReply%3A-%29,sortOrder:REVERSE_CHRONOLOGICAL,start:{start})"
        self.post_id = None
        # Page step used by fetch_comments (attribute name kept as-is for
        # backward compatibility).
        self.num_commments = 100

    def fetch_comments(self, post_id: str):
        """Page through the comments of *post_id* and return them as a list.

        Paging stops on the first empty page, the first non-OK response or
        the first undecodable body. Comments collected before such a failure
        are still returned, so the result is always a list.

        Raises:
            TooManyRequests: when LinkedIn answers with HTTP 429.
        """
        all_comments = []
        self.post_id = post_id

        for start in range(0, 200_000, self.num_commments):
            logger.info(f"Fetching comments for post {post_id}, start {start}")
            ep = self.endpoint.format(post_id=post_id, start=start)
            res = self.session.get(ep)
            logger.debug(f"comments info, status code - {res.status_code}")

            if res.status_code == 429:
                raise TooManyRequests("429 Too Many Requests")
            if not res.ok:
                logger.debug(res.text[:200])
                break

            try:
                comments_json = res.json()
            except json.decoder.JSONDecodeError:
                logger.debug(res.text[:200])
                break

            comments, num_results = self.parse_comments(comments_json)
            all_comments.extend(comments)
            if not num_results:
                break

        return all_comments

    def parse_comments(self, comments_json: dict):
        """Parse one page of the comments response.

        Returns:
            ``(comments, num_results)``: the parsed ``Comment`` objects and
            the number of raw elements on the page (0 ends the paging).
        """
        # ``or`` instead of a .get() default: the API may send explicit nulls.
        data = comments_json.get("data") or {}
        detail = data.get("socialDashCommentsBySocialDetail") or {}
        elements = detail.get("elements") or []

        comments = []
        for element in elements:
            commenter = element["commenter"]
            internal_profile_id = commenter["commenterProfileId"]
            name = commenter["title"]["text"]

            linkedin_id_match = re.search(r"/in/(.+)", commenter["navigationUrl"])
            linkedin_id = linkedin_id_match.group(1) if linkedin_id_match else None

            commentary = (element.get("commentary") or {}).get("text", "")
            # The comment id is the last comma-separated part of the urn.
            comment_id = element["urn"].split(",")[-1].rstrip(")")
            num_likes = element["socialDetail"]["totalSocialActivityCounts"]["numLikes"]

            comment = Comment(
                post_id=self.post_id,
                comment_id=comment_id,
                internal_profile_id=internal_profile_id,
                public_profile_id=linkedin_id,
                name=name,
                text=commentary,
                num_likes=num_likes,
                # createdAt is divided by 1000 before conversion (ms -> s).
                created_at=dt.utcfromtimestamp(element["createdAt"] / 1000),
            )
            comments.append(comment)

        return comments, len(elements)
def parse_emp_contact_info(self, emp: Staff, emp_dict: dict):
    """Build a ``ContactInfo`` from a profile response and attach it to *emp*.

    The first element of ``data.identityDashProfilesByMemberIdentity`` is
    required. Every individual field below it is optional: a missing or
    malformed field is skipped and keeps its ``ContactInfo`` default.
    """
    contact_info = ContactInfo()
    profile = emp_dict["data"]["identityDashProfilesByMemberIdentity"]["elements"][0]
    # Errors that mean "this field is absent or has an unexpected shape".
    absent = (KeyError, IndexError, TypeError)

    try:
        contact_info.email_address = profile["emailAddress"]["emailAddress"]
    except absent:
        pass

    try:
        contact_info.address = profile["address"]
    except absent:
        pass

    try:
        birth = profile["birthDateOn"]
        # Rendered as "<Month name> <day>".
        contact_info.birthday = f"{month_name[birth['month']]} {birth['day']}"
    except absent:
        pass

    try:
        contact_info.websites = [site["url"] for site in profile["websites"]]
    except absent:
        pass

    try:
        contact_info.phone_numbers = [
            entry["phoneNumber"]["number"] for entry in profile["phoneNumbers"]
        ]
    except absent:
        pass

    try:
        relationship = profile["memberRelationship"][
            "memberRelationshipDataResolutionResult"
        ]
        created_ms = relationship["connection"]["createdAt"]
        # createdAt is divided by 1000 before conversion (ms -> s), in UTC.
        connected_at = datetime.fromtimestamp(
            created_ms / 1000, tz=pytz.timezone("UTC")
        )
        contact_info.created_at = connected_at.strftime("%Y-%m-%d %H:%M:%S %Z")
    except absent:
        pass

    emp.contact_info = contact_info
class EmployeeFetcher:
    """Fetch and parse the top-card ("basic info") profile data of one staff member."""

    def __init__(self, session):
        self.session = session
        self.endpoint = "https://www.linkedin.com/voyager/api/voyagerIdentityDashProfiles?count=1&decorationId=com.linkedin.voyager.dash.deco.identity.profile.TopCardComplete-138&memberIdentity={employee_id}&q=memberIdentity"
        # Company domain used to build candidate e-mail addresses; it is
        # overwritten on every fetch_employee call.
        self.domain = None

    def fetch_employee(self, base_staff, domain):
        """Request the top-card profile of ``base_staff`` and parse it in place.

        Args:
            base_staff: staff record; ``base_staff.id`` is substituted into the
                endpoint and the record is updated by ``parse_emp``.
            domain: company domain for e-mail guesses, or a falsy value to
                skip them.

        Returns:
            True when the profile was parsed, False when the response was not
            OK, was not JSON, or had no ``elements[0]``. On HTTP 429 a
            TooManyRequests instance is returned.
        """
        self.domain = domain
        ep = self.endpoint.format(employee_id=base_staff.id)
        res = self.session.get(ep)
        logger.debug(f"basic info, status code - {res.status_code}")

        if res.status_code == 429:
            # NOTE(review): the exception is returned, not raised, so callers
            # that only catch TooManyRequests never see it - confirm intent
            # before changing, other fetchers follow the same convention.
            return TooManyRequests("429 Too Many Requests")
        if not res.ok:
            logger.debug(res.text[:200])
            return False

        try:
            res_json = res.json()
        except json.decoder.JSONDecodeError:
            logger.debug(res.text[:200])
            return False

        try:
            employee_json = res_json["elements"][0]
        except (KeyError, IndexError, TypeError):
            logger.debug(res_json)
            return False

        self.parse_emp(base_staff, employee_json)
        return True

    def parse_emp(self, emp: Staff, emp_dict: dict):
        """Parse the employee data from the employee profile.

        Copies photo URLs, identifiers, names, connection state, counters and
        the top position / education from ``emp_dict`` onto ``emp``.

        Raises:
            KeyError / TypeError: when a field read without a fallback
                (``publicIdentifier``, ``memberRelationship``, ``firstName``,
                ``lastName``, ``connections``) is missing or malformed.
        """

        def get_photo_url(emp_dict: dict, key: str):
            # The last artifact is the one used; returns None when any part
            # of the image reference is missing.
            try:
                photo_data = emp_dict[key]["displayImageReference"]["vectorImage"]
                photo_base_url = photo_data["rootUrl"]
                photo_ext_url = photo_data["artifacts"][-1][
                    "fileIdentifyingUrlPathSegment"
                ]
                return f"{photo_base_url}{photo_ext_url}"
            except (KeyError, TypeError, IndexError, ValueError):
                return None

        emp.profile_photo = get_photo_url(emp_dict, "profilePicture")
        emp.banner_photo = get_photo_url(emp_dict, "backgroundPicture")
        emp.profile_id = emp_dict["publicIdentifier"]

        try:
            emp.headline = emp_dict.get("headline")
            if not emp.headline:
                # Fall back to the headline nested in the relationship data.
                emp.headline = emp_dict["memberRelationship"]["memberRelationshipData"][
                    "noInvitation"
                ]["targetInviteeResolutionResult"]["headline"]
        except (KeyError, TypeError, AttributeError):
            # Only lookup failures are expected here; anything else should
            # surface instead of being silently swallowed.
            pass

        relationship_union = emp_dict["memberRelationship"]["memberRelationshipUnion"]
        union_type = next(iter(relationship_union))
        emp.is_connection = "no"
        if union_type == "connection":
            emp.is_connection = "yes"
        elif union_type == "noConnection":
            invitation = (
                relationship_union["noConnection"]
                .get("invitationUnion", {})
                .get("invitation", {})
            )
            if invitation and invitation.get("invitationState") == "PENDING":
                emp.is_connection = "pending"

        # A profile without a picture has no frame either; get_photo_url above
        # already tolerates a missing/null picture, so do the same here
        # instead of raising.
        frame_type = (emp_dict.get("profilePicture") or {}).get("frameType")
        emp.open_to_work = frame_type == "OPEN_TO_WORK"
        emp.is_hiring = frame_type == "HIRING"

        emp.first_name = emp_dict["firstName"]
        # Anything after the first comma in the last name is dropped.
        emp.last_name = emp_dict["lastName"].split(",")[0]
        if not emp.name:
            emp.name = " ".join(filter(None, [emp.first_name, emp.last_name]))
        emp.potential_emails = (
            utils.create_emails(emp.first_name, emp.last_name, self.domain)
            if self.domain
            else None
        )

        emp.followers = emp_dict.get("followingState", {}).get("followerCount")
        emp.connections = emp_dict["connections"]["paging"]["total"]
        emp.location = (
            emp_dict.get("geoLocation", {}).get("geo", {}).get("defaultLocalizedName")
        )

        # Handle empty elements case for company
        top_positions = emp_dict.get("profileTopPosition", {}).get("elements", [])
        emp.company = top_positions[0].get("companyName", None) if top_positions else None

        edu_cards = emp_dict.get("profileTopEducation", {}).get("elements", [])
        if edu_cards:
            first_card = edu_cards[0]
            # The fallback is computed even when schoolName exists, so it must
            # not fail on a null "school" entry.
            fallback_name = (first_card.get("school") or {}).get("name")
            emp.school = first_card.get("schoolName", fallback_name)

        emp.influencer = emp_dict.get("influencer", False)
        emp.creator = emp_dict.get("creator", False)
        emp.premium = emp_dict.get("premium", False)

        emp.mutual_connections = 0
        try:
            profile_insight = emp_dict.get("profileInsight", {}).get("elements", [])
            if profile_insight:
                mutual_connections_str = profile_insight[0]["text"]["text"]
                match = re.search(r"\d+", mutual_connections_str)
                if match:
                    # The first number in the text plus 2 (presumably the
                    # text names two people before "and N others" - confirm).
                    emp.mutual_connections = int(match.group()) + 2
                else:
                    emp.mutual_connections = (
                        2 if " and " in mutual_connections_str else 1
                    )
        except (KeyError, TypeError, IndexError, ValueError):
            pass
class ExperiencesFetcher:
    """Fetch and parse the "experience" section of a profile."""

    def __init__(self, session):
        self.session = session
        self.endpoint = "https://www.linkedin.com/voyager/api/graphql?queryId=voyagerIdentityDashProfileComponents.277ba7d7b9afffb04683953cede751fb&queryName=ProfileComponentsBySectionType&variables=(tabIndex:0,sectionType:experience,profileUrn:urn%3Ali%3Afsd_profile%3A{employee_id},count:50)"

    def fetch_experiences(self, staff):
        """Request the experience section for ``staff`` and store the parsed list.

        Returns:
            True when ``staff.experiences`` was set, False when the response
            was not OK, was not JSON, or lacked the expected structure. On
            HTTP 429 a TooManyRequests instance is returned.

        Raises:
            Exception: when the response reason is "INKApi Error".
        """
        ep = self.endpoint.format(employee_id=staff.id)
        res = self.session.get(ep)
        logger.debug(f"exps, status code - {res.status_code}")

        if res.reason == "INKApi Error":
            raise Exception(
                "Delete session file and log in again",
                res.status_code,
                res.text[:200],
                res.reason,
            )
        elif res.status_code == 429:
            # NOTE(review): returned rather than raised - confirm intent.
            return TooManyRequests("429 Too Many Requests")
        elif not res.ok:
            logger.debug(res.text[:200])
            return False

        try:
            res_json = res.json()
        except json.decoder.JSONDecodeError:
            logger.debug(res.text[:200])
            return False

        try:
            elements = res_json["data"][
                "identityDashProfileComponentsBySectionType"
            ]["elements"][0]["components"]["pagedListComponent"]["components"][
                "elements"
            ]
        except (KeyError, IndexError, TypeError):
            logger.debug(res_json)
            return False

        staff.experiences = self.parse_experiences(elements)
        return True

    @staticmethod
    def _has_nested_roles(entity):
        """Return True when ``entity`` carries a nested paged list of roles."""
        sub_components = entity.get("subComponents")
        if sub_components is None:
            return False
        nested = sub_components.get("components") or []
        if len(nested) == 0:
            return False
        inner = nested[0].get("components")
        return inner is not None and inner.get("pagedListComponent") is not None

    @staticmethod
    def _split_duration(duration):
        """Return ``(duration, start_date, end_date)`` for a caption text.

        When ``utils.parse_duration`` finds a from-date and the caption has a
        " · " separator, only the part after the separator is kept as the
        duration; otherwise the caption is returned unchanged.
        """
        if not duration:
            return duration, None, None
        start_date, end_date = utils.parse_dates(duration)
        from_date, _ = utils.parse_duration(duration)
        if from_date:
            pieces = duration.split(" · ")
            if len(pieces) > 1:
                duration = pieces[1]
        return duration, start_date, end_date

    def parse_experiences(self, elements):
        """Convert experience-section elements into a list of Experience.

        Entries with a nested role list are expanded via ``parse_multi_exp``.
        Parsing is best-effort per element: a failing element is logged with
        its traceback and skipped.
        """
        exps = []
        for elem in elements:
            try:
                components = elem.get("components")
                if components is None:
                    continue
                entity = components.get("entityComponent")
                if entity is None:
                    continue

                if self._has_nested_roles(entity):
                    exps += self.parse_multi_exp(entity)
                    continue

                emp_type = None
                caption = entity.get("caption")
                duration, start_date, end_date = self._split_duration(
                    caption.get("text") if caption else None
                )

                subtitle = entity.get("subtitle")
                company = subtitle.get("text") if subtitle else None
                title_v2 = entity.get("titleV2")
                title_text = title_v2.get("text") if title_v2 else None
                title = title_text.get("text") if title_text else None
                metadata = entity.get("metadata")
                location = metadata.get("text") if metadata else None

                if company:
                    # The subtitle is split on " · ": first part is the
                    # company, last part the employment type.
                    parts = company.split(" · ")
                    if len(parts) > 1:
                        company = parts[0]
                        emp_type = parts[-1].lower()

                exps.append(
                    Experience(
                        duration=duration,
                        title=title,
                        company=company,
                        emp_type=emp_type,
                        start_date=start_date,
                        end_date=end_date,
                        location=location,
                    )
                )
            except Exception as e:
                logger.exception(e)
        return exps

    def parse_multi_exp(self, entity):
        """Expand one entry holding several roles into one Experience per role.

        The entry's title is used as the company for every role. Optional
        fields (caption, subtitle, metadata) may be missing or null, and a
        caption without the " · " separator is kept as-is, so one odd role no
        longer discards all roles of the entry.
        """
        exps = []
        company = entity["titleV2"]["text"]["text"]
        elements = entity["subComponents"]["components"][0]["components"][
            "pagedListComponent"
        ]["components"]["elements"]

        for elem in elements:
            role = elem["components"]["entityComponent"]
            caption = role.get("caption")
            duration, start_date, end_date = self._split_duration(
                caption.get("text") if caption else None
            )
            title = role["titleV2"]["text"]["text"]
            subtitle = role.get("subtitle")
            emp_type = subtitle["text"].lower() if subtitle else None
            metadata = role.get("metadata")
            location = metadata["text"] if metadata else None

            exps.append(
                Experience(
                    duration=duration,
                    title=title,
                    company=company,
                    emp_type=emp_type,
                    start_date=start_date,
                    end_date=end_date,
                    location=location,
                )
            )
        return exps
def parse_languages(self, language_json: dict) -> list[str]:
    """Return the language names listed in a languages-section response.

    Walks to the paged list under the first
    ``identityDashProfileComponentsBySectionType`` element and collects the
    title text of every entry whose ``entityComponent`` is truthy; entries
    without one are skipped.
    """
    first_section = language_json["data"][
        "identityDashProfileComponentsBySectionType"
    ]["elements"][0]
    entries = first_section["components"]["pagedListComponent"]["components"][
        "elements"
    ]
    entities = (entry["components"]["entityComponent"] for entry in entries)
    return [entity["titleV2"]["text"]["text"] for entity in entities if entity]
def __init__(self, session: requests.Session):
    """Store the session and create one fetcher per profile section.

    All per-scrape state starts as None and is filled in by the scrape
    entry points.
    """
    self.session = session

    # Per-scrape state.
    self.company_id = None
    self.staff_count = None
    self.num_staff = None
    self.company_name = None
    self.domain = None
    self.max_results = None
    self.search_term = None
    self.location = None
    self.raw_location = None

    # Flags set when LinkedIn starts rejecting requests.
    self.on_block = False
    self.connect_block = False

    # Section fetchers, all sharing the same session.
    self.certs = CertificationFetcher(session)
    self.skills = SkillsFetcher(session)
    self.employees = EmployeeFetcher(session)
    self.schools = SchoolsFetcher(session)
    self.experiences = ExperiencesFetcher(session)
    self.bio = EmployeeBioFetcher(session)
    self.languages = LanguagesFetcher(session)
    self.contact = ContactInfoFetcher(session)
def fetch_or_search_company(self, company_name):
    """Fetch the company details by name, or search if not found.

    The name is first tried directly as the company's universal name. On a
    404 the name is passed to ``search_companies`` and the lookup is retried
    with the id of the first search hit.

    Returns:
        The response of the successful (status 200) lookup.

    Raises:
        Exception: when the direct lookup answers with anything other than
            200/404, or when the lookup after the search is not 200.
    """

    def lookup(name):
        # Percent-encode the name so characters such as "&" or "#" cannot
        # break the query string; unquote first so that a name that is
        # already percent-encoded is not encoded twice.
        return self.session.get(f"{self.company_id_ep}{quote(unquote(str(name)))}")

    res = lookup(company_name)
    if res.status_code == 200:
        return res

    if res.status_code != 404:
        raise Exception(
            f"Failed to find company {company_name} (likely due to outdated login if you know it's valid company)",
            res.status_code,
            res.text[:200],
        )

    logger.info(
        f"Failed to directly use company '{company_name}' as company id, now searching for the company"
    )
    company_name = self.search_companies(company_name)
    res = lookup(company_name)
    if res.status_code != 200:
        raise Exception(
            f"Failed to find company after performing a direct and generic search for {company_name}",
            res.status_code,
            res.text[:200],
        )
    return res
def fetch_staff(self, offset: int):
    """Fetch the staff using LinkedIn search

    Builds the people-search URL from the scraper's current company id,
    search term and geo id (each filter is left out when unset) and requests
    one page of 50 results starting at ``offset``.

    Returns:
        ``(staff, total_count)``; ``(None, 0)`` when the response is not OK,
        is not JSON, or lacks the expected keys.

    Raises:
        BadCookies: on HTTP 400.
        TooManyRequests: on HTTP 429.
    """
    company_filter = ""
    if self.company_id:
        company_filter = f"(key:currentCompany,value:List({self.company_id})),"
    keyword_filter = ""
    if self.search_term:
        keyword_filter = f"keywords:{quote(self.search_term)},"
    geo_filter = ""
    if self.location:
        geo_filter = f"(key:geoUrn,value:List({self.location})),"

    url = self.employees_ep.format(
        offset=offset,
        company_id=company_filter,
        count=50,
        search=keyword_filter,
        location=geo_filter,
    )
    response = self.session.get(url)

    if not response.ok:
        logger.debug(f"employees, status code - {response.status_code}")
    if response.status_code == 400:
        raise BadCookies("Outdated login, delete the session file to log in again")
    if response.status_code == 429:
        raise TooManyRequests("429 Too Many Requests")
    if not response.ok:
        return None, 0

    try:
        payload = response.json()
    except json.decoder.JSONDecodeError:
        logger.debug(response.text)
        return None, 0

    try:
        clusters = payload["data"]["searchDashClustersByAll"]
        elements = clusters["elements"]
        total_count = clusters["metadata"]["totalResultCount"]
    except (KeyError, IndexError, TypeError):
        logger.debug(payload)
        return None, 0

    if not elements:
        return [], total_count
    return self.parse_staff(elements), total_count
def scrape_connections(
    self,
    max_results: int = 10**8,
    extra_profile_data: bool = False,
):
    """Collect the account's connections, 50 per page.

    Args:
        max_results: upper bound on the number of returned entries.
        extra_profile_data: when True, every entry whose name is not
            "LinkedIn Member" is enriched via ``fetch_all_info_for_employee``.

    Returns:
        The collected staff list, truncated to ``max_results``. A failed
        page (``fetch_connections_page`` returning None) ends the collection
        with whatever was gathered so far instead of raising.
    """
    self.search_term = "connections"
    staff_list: list[Staff] = []

    try:
        # fetch_connections_page returns a bare None, not a tuple, when the
        # response cannot be used, so it must not be unpacked directly.
        first_page = self.fetch_connections_page(0)
        initial_staff, total_search_result_count = first_page or (None, 0)
        if initial_staff:
            staff_list.extend(initial_staff)

        self.num_staff = min(total_search_result_count, max_results)
        for offset in range(50, self.num_staff, 50):
            page = self.fetch_connections_page(offset)
            staff = page[0] if page else None
            new_count = len(staff) if staff else 0
            logger.debug(
                f"Connections from search: {new_count} new, {len(staff_list) + new_count} total"
            )
            if not staff:
                break
            staff_list.extend(staff)
    except (BadCookies, TooManyRequests) as e:
        self.on_block = True
        logger.error(f"Exiting early due to fatal error: {str(e)}")
        return staff_list[:max_results]

    reduced_staff_list = staff_list[:max_results]

    if extra_profile_data:
        # Entries named "LinkedIn Member" are skipped for enrichment.
        non_restricted = [
            member for member in reduced_staff_list if member.name != "LinkedIn Member"
        ]
        try:
            for i, employee in enumerate(non_restricted, start=1):
                self.fetch_all_info_for_employee(employee, i)
        except TooManyRequests as e:
            logger.error(str(e))

    return reduced_staff_list
def scrape_staff(
    self,
    company_name: str | None,
    search_term: str,
    location: str,
    extra_profile_data: bool,
    max_results: int,
    block: bool,
    connect: bool,
):
    """Main function entry point to scrape LinkedIn staff

    Resolves the company id and geo id when given, pages through the people
    search 50 results at a time (at most 1000 results), and optionally
    enriches, blocks or connects with each non-restricted result.

    Args:
        company_name: company to search within, or a falsy value for none.
        search_term: keywords for the people search.
        location: free-text location, resolved through ``fetch_location_id``.
        extra_profile_data: fetch the extra profile sections per result.
        max_results: upper bound on the number of returned entries.
        block: block each enriched user (takes precedence over ``connect``).
        connect: send a connection request to each enriched user.

    Returns:
        The collected staff list, truncated to ``max_results``. It is empty
        when the location cannot be resolved, and partial when LinkedIn
        rejects the session or rate-limits the search.
    """
    self.search_term = search_term
    self.company_name = company_name
    self.max_results = max_results
    self.raw_location = location
    self.company_id = None
    # Reset the geo filter as well, otherwise a geo id resolved by an earlier
    # call on this scraper would still be applied by fetch_staff.
    self.location = None

    if self.company_name:
        self.company_id, _ = self._get_company_id_and_staff_count(company_name)

    staff_list: list[Staff] = []
    if self.raw_location:
        try:
            self.fetch_location_id()
        except GeoUrnNotFound as e:
            logger.error(str(e))
            return staff_list[:max_results]

    # Built once and kept apart from the ``location`` argument so the second
    # log line does not wrap an already formatted label again.
    location_label = f", location: '{location}'" if location else ""

    try:
        initial_staff, total_count = self.fetch_staff(0)
        if initial_staff:
            staff_list.extend(initial_staff)

        logger.info(
            f"1) Search results for company: '{company_name}'{location_label} - {total_count:,} staff"
        )
        self.num_staff = min(total_count, max_results, 1000)
        for offset in range(50, self.num_staff, 50):
            staff, _ = self.fetch_staff(offset)
            # fetch_staff returns None for the list when a page fails.
            new_count = len(staff) if staff else 0
            logger.debug(
                f"Staff members from search: {new_count} new, {len(staff_list) + new_count} total"
            )
            if not staff:
                break
            staff_list.extend(staff)

        logger.info(
            f"2) Total results collected for company: '{company_name}'{location_label} - {len(staff_list)} results"
        )
    except (BadCookies, TooManyRequests) as e:
        self.on_block = True
        logger.error(f"Exiting early due to fatal error: {str(e)}")
        return staff_list[:max_results]

    reduced_staff_list = staff_list[:max_results]

    if extra_profile_data:
        # Entries named "LinkedIn Member" are skipped for enrichment.
        non_restricted = [
            member for member in reduced_staff_list if member.name != "LinkedIn Member"
        ]
        try:
            for i, employee in enumerate(non_restricted, start=1):
                self.fetch_all_info_for_employee(employee, i)
                if block:
                    self.block_user(employee)
                elif connect:
                    self.connect_user(employee)
        except TooManyRequests as e:
            logger.error(str(e))

    return reduced_staff_list
def connect_user(self, employee: Staff) -> None:
    """Connects with a user on LinkedIn given their profile id

    Nothing is sent when connection requests are paused for this scrape
    (``self.connect_block``), when the employee's urn is "headless", or when
    ``employee.is_connection`` is anything other than "no". An HTTP 429
    answer pauses further requests by setting ``self.connect_block``.
    """
    if self.connect_block:
        logger.info(
            f"Skipping connection request for user due to previous block: {employee.id} - {employee.profile_link} "
        )
        return
    if employee.urn == "headless":
        return
    if employee.is_connection != "no":
        logger.info(
            f"Already connected or pending connection request to user {employee.id} - {employee.profile_link}"
        )
        return

    body = (
        b"\x00\x01\x03\xe2\x05\x00\x01\x03\xd3w\x00\x01\x03\xd5\x06\x14:urn:li:fsd_profile:"
        + employee.id.encode()
    )
    self.session.headers["Content-Type"] = (
        "application/x-protobuf2; symbol-table=voyager-20757"
    )
    try:
        res = self.session.post(
            self.connect_to_user_ep,
            data=body,
        )
    finally:
        # The session is shared by every fetcher; always remove the
        # request-specific content type, even when the POST raises.
        self.session.headers.pop("Content-Type", "")

    if res.ok:
        logger.info(
            f"Successfully sent connection request to user {employee.id} - {employee.profile_link}"
        )
    elif res.status_code == 429:
        self.connect_block = True
        logger.warning(
            f"Failed to connect to user - status code 429 - pausing connection requests for this scrape: {employee.id} - {employee.profile_link}"
        )
    else:
        logger.warning(
            f"Failed to connect to user - status code {res.status_code} {employee.id} -{employee.profile_link}"
        )
def parse_schools(self, elements):
    """Convert education-section elements into a list of School.

    Parsing stops at the first element whose ``entityComponent`` is falsy.
    The caption text, when present, is passed to ``parse_dates`` to obtain
    the start and end dates; a school without a caption gets None for both.
    """
    schools = []
    for elem in elements:
        entity = elem["components"]["entityComponent"]
        if not entity:
            break

        # Reset for every entry: a school without a caption must not inherit
        # the dates parsed for the previous school.
        start = end = None
        years = entity["caption"]["text"] if entity["caption"] else None
        school_name = entity["titleV2"]["text"]["text"]
        if years:
            start, end = parse_dates(years)
        degree = entity["subtitle"]["text"] if entity["subtitle"] else None

        schools.append(
            School(start_date=start, end_date=end, school=school_name, degree=degree)
        )
    return schools
def parse_skills(self, sections):
    """Convert the skills tab sections into a de-duplicated list of Skill.

    A skill name is reported once even when it appears in several sections.
    For every skill the insight texts are scanned for an endorsement count
    ("N endorsements" or "1 endorsement") and for the text
    "Passed LinkedIn Skill Assessment".
    """
    seen_names = set()
    skills = []
    for section in sections:
        elems = section["subComponent"]["components"]["pagedListComponent"][
            "components"
        ]["elements"]
        for elem in elems:
            passed_assessment, endorsements = None, 0
            entity = elem["components"]["entityComponent"]
            name = entity["titleV2"]["text"]["text"]
            if name in seen_names:
                continue
            seen_names.add(name)

            for component in entity["subComponents"]["components"]:
                try:
                    candidate = component["components"]["insightComponent"]["text"][
                        "text"
                    ]["text"]
                except (KeyError, TypeError):
                    # Not an insight component (or an empty one).
                    continue
                if not isinstance(candidate, str):
                    continue

                if " endorsement" in candidate:
                    # Matches both the singular and the plural wording.
                    count_text = candidate.split(" endorsement", 1)[0]
                    try:
                        endorsements = int(count_text)
                    except ValueError:
                        # Text before the marker is not a plain number;
                        # keep the previous count.
                        pass
                if "Passed LinkedIn Skill Assessment" in candidate:
                    passed_assessment = True

            skills.append(
                Skill(
                    name=name,
                    endorsements=endorsements,
                    passed_assessment=passed_assessment,
                )
            )
    return skills
staffspy[browser]` and then remove the username/password params to the LinkedInAccount()", res.text, ) logger.info(f"Received captcha solver taskId: {task_id} / Getting result...") while True: time.sleep(1) # delay payload = {"clientKey": self.solver_api_key, "taskId": task_id} res = requests.post("https://api.capsolver.com/getTaskResult", json=payload) resp = res.json() status = resp.get("status") if status == "ready": logger.info(f"CapSolver finished solving captcha") return resp.get("solution", {}).get("token") if status == "failed" or resp.get("errorId"): logger.info(f"Captcha solve failed! response: {res.text}") return None ================================================ FILE: staffspy/solvers/solver.py ================================================ from abc import ABC,abstractmethod class Solver(ABC): public_key = "3117BF26-4762-4F5A-8ED9-A85E69209A46" page_url = "https://iframe.arkoselabs.com" def __init__(self, solver_api_key:str): self.solver_api_key=solver_api_key @abstractmethod def solve(self, blob_data: str, page_ur: str=None): pass ================================================ FILE: staffspy/solvers/solver_type.py ================================================ from enum import Enum class SolverType(Enum): CAPSOLVER = 'capsolver' TWO_CAPTCHA = 'twocaptcha' ================================================ FILE: staffspy/solvers/two_captcha.py ================================================ from tenacity import retry_if_exception_type, stop_after_attempt, retry from twocaptcha import TwoCaptcha, TimeoutException, ApiException, NetworkException from staffspy.solvers.solver import Solver class TwoCaptchaSolver(Solver): """https://2captcha.com/""" attempt = 1 @retry( stop=stop_after_attempt(5), retry=retry_if_exception_type( (TimeoutException, ApiException, NetworkException) ), ) def solve(self, blob_data: str, page_url: str = None): super().solve(blob_data, page_url) from staffspy.utils.utils import logger logger.info( f"Waiting on 2Captcha to 
class BrowserType(Enum):
    """Browsers that can be selected for the selenium-based login."""

    CHROME = "chrome"
    FIREFOX = "firefox"


class DriverType:
    """Pairs a browser choice with an optional webdriver executable path."""

    def __init__(
        self, browser_type: BrowserType, executable_path: Optional[str] = None
    ):
        """Record which browser to start and where its driver binary lives.

        Args:
            browser_type: Member of :class:`BrowserType`.
            executable_path: Path to the webdriver executable, or ``None``
                when no explicit path is given.
        """
        self.browser_type, self.executable_path = browser_type, executable_path
class Skill(BaseModel):
    """A single skill listed on a profile."""

    name: str | None = None
    endorsements: int | None = None
    passed_assessment: bool | None = None

    def to_dict(self):
        """Return the skill as a plain dict; a missing endorsement count becomes 0."""
        endorsement_count = self.endorsements or 0
        return dict(
            name=self.name,
            endorsements=endorsement_count,
            passed_assessment=self.passed_assessment,
        )
def get_top_skills(self):
    """Return the names of the three most-endorsed skills, padded with None.

    Skills whose ``endorsements`` is ``None`` are ranked as having zero
    endorsements; previously such a value made the sort raise ``TypeError``.
    Skills with equal counts keep their original relative order.

    Returns:
        A list of exactly three entries (skill names or ``None``).
    """
    top_three_skills = []
    if self.skills:
        ranked = sorted(
            self.skills, key=lambda skill: skill.endorsements or 0, reverse=True
        )
        top_three_skills = [skill.name for skill in ranked[:3]]
    # Always hand back three slots so callers can index 0..2 safely.
    top_three_skills += [None] * (3 - len(top_three_skills))
    return top_three_skills
[None] * (3 - len(top_three_companies)) top_three_skills = self.get_top_skills() self.emails_in_bio = extract_emails_from_text(self.bio) if self.bio else None self.current_position = ( sorted_experiences[0].title if len(sorted_experiences) > 0 and sorted_experiences[0].end_date is None else None ) contact_info = self.contact_info.to_dict() if self.contact_info else {} return { "search_term": self.search_term, "id": self.id, "urn": self.urn, "profile_link": self.profile_link, "profile_id": self.profile_id, "name": self.name, "first_name": self.first_name, "last_name": self.last_name, "location": self.location, "headline": self.headline, "estimated_age": estimated_age, "followers": self.followers, "connections": self.connections, "mutuals": self.mutual_connections, "is_connection": self.is_connection, "premium": self.premium, "creator": self.creator, "influencer": self.influencer, "open_to_work": self.open_to_work, "is_hiring": self.is_hiring, "current_position": self.current_position, "current_company": top_three_companies[0], "past_company_1": top_three_companies[1], "past_company_2": top_three_companies[2], "school_1": top_three_school_names[0], "school_2": top_three_school_names[1], "top_skill_1": top_three_skills[0], "top_skill_2": top_three_skills[1], "top_skill_3": top_three_skills[2], "bio": self.bio, "experiences": ( [exp.to_dict() for exp in self.experiences] if self.experiences else None ), "schools": ( [school.to_dict() for school in self.schools] if self.schools else None ), "skills": ( [skill.to_dict() for skill in self.skills] if self.skills else None ), "certifications": ( [cert.to_dict() for cert in self.certifications] if self.certifications else None ), "languages": self.languages, "emails_in_bio": ( ", ".join(self.emails_in_bio) if self.emails_in_bio else None ), "potential_emails": self.potential_emails, "profile_photo": self.profile_photo, "banner_photo": self.banner_photo, "connection_created_at": contact_info.get("created_at"), 
def estimate_age_based_on_education(self):
    """Estimate age as 18 plus the whole years since the first college start date.

    Schools are considered in ascending ``start_date`` order; entries
    without a start date are ignored. A school counts as college when its
    name contains "uni" or "college" (case-insensitive) or when it has a
    degree. A missing school name is treated as an empty string instead of
    raising ``AttributeError``.

    Returns:
        The estimated age as an int, or ``None`` when no qualifying school
        with a start date exists.
    """
    college_words = ("uni", "college")
    dated_schools = sorted(
        (school for school in (self.schools or []) if school.start_date),
        key=lambda school: school.start_date,
    )
    current_date = datetime.now().date()
    for school in dated_schools:
        school_name = (school.school or "").lower()
        if school.degree or any(word in school_name for word in college_words):
            years_in_education = (current_date - school.start_date).days // 365
            return int(18 + years_in_education)
    return None
def solve_captcha(self, session, data, payload):
    """Solve the login captcha challenge and submit the verification form.

    Args:
        session: HTTP session used for the login flow; must already carry
            the ``JSESSIONID`` cookie.
        data: Login response dict; ``data["challenge_url"]`` is requested.
        payload: Form body posted to the challenge URL.

    Raises:
        BlobException: The challenge page holds no captcha blob.
        Exception: A stronger password is demanded, no solver is configured,
            the solver returned no token, a form field or the JSESSIONID
            cookie is missing, or the verification request failed.
    """
    url = data["challenge_url"]
    r = session.post(url, data=payload)

    soup = BeautifulSoup(r.text, "html.parser")
    code_tag = soup.find("code", id="securedDataExchange")
    logger.info("Searching for captcha blob in linkedin to begin captcha solving")

    if code_tag is not None and code_tag.contents:
        extracted_code = str(code_tag.contents[0]).strip()
        # Lazy %-formatting: the previous call passed the blob as an extra
        # argument without a placeholder, which made logging report an error.
        logger.debug("Extracted captcha blob: %s", extracted_code)
    elif "Please choose a more secure password." in r.text:
        raise Exception(
            "linkedin is requiring a more secure password. reset pw and try again"
        )
    else:
        # Also reached for an empty <code> tag, so the caller's retry applies.
        raise BlobException(
            "blob to solve captcha not found - rerunning the program usually solves this"
        )

    if not self.solver:
        raise Exception(
            "captcha hit - provide solver_api_key and solver_service name to solve or switch to the browser-based login with `pip install staffspy[browser]`"
        )
    token = self.solver.solve(extracted_code, url)
    if not token:
        raise Exception("failed to solve captcha after 10 attempts")

    def form_value(field_name):
        """Return the value of the named <input> on the challenge page."""
        tag = soup.find("input", {"name": field_name})
        if tag is None:
            raise Exception(
                f"captcha challenge form is missing the '{field_name}' field"
            )
        return tag["value"]

    for cookie in session.cookies:
        if cookie.name == "JSESSIONID":
            jsession_value = cookie.value.split("ajax:")[1].strip('"')
            break
    else:
        raise Exception("jsessionid not found, raise issue on GitHub")
    csrf_token = f"ajax:{jsession_value}"

    # Key order mirrors the original request body.
    payload = {
        "csrfToken": csrf_token,
        "captchaSiteKey": form_value("captchaSiteKey"),
        "challengeId": form_value("challengeId"),
        "language": "en-US",
        "displayTime": form_value("displayTime"),
        "challengeType": form_value("challengeType"),
        "challengeSource": form_value("challengeSource"),
        "requestSubmissionId": form_value("requestSubmissionId"),
        "captchaUserResponseToken": token,
        "challengeData": form_value("challengeData"),
        "pageInstance": form_value("pageInstance"),
        "challengeDetails": form_value("challengeDetails"),
        "failureRedirectUri": form_value("failureRedirectUri"),
        "signInLink": form_value("signInLink"),
        "joinNowLink": form_value("joinNowLink"),
        "_s": "CONSUMER_LOGIN",
    }
    # Percent-encode every character (safe="") and build the body by hand.
    query_string = "&".join(
        f"{key}={quote(str(value), '')}" for key, value in payload.items()
    )

    response = session.post(
        "https://www.linkedin.com/checkpoint/challenge/verify", data=query_string
    )
    if not response.ok:
        raise Exception(f"verify captcha failed {response.text[:200]}")
def login_browser(self):
    """Backup login method: let the user sign in through a real browser.

    Opens the login page with selenium, waits for the user to press enter,
    copies the browser cookies into a new ``requests.Session`` and sets its
    CSRF header.

    Returns:
        The authenticated ``requests.Session``.

    Raises:
        Exception: No usable webdriver could be started.
    """
    driver = get_webdriver(self.driver_type)
    if driver is None:
        logger.debug("No browser found for selenium")
        raise Exception("driver not found for selenium")

    try:
        driver.get("https://linkedin.com/login")
        input("Press enter after logged in")
        selenium_cookies = driver.get_cookies()
    finally:
        # Always close the browser, even if navigation, the prompt or the
        # cookie read fails, so no driver process is left behind.
        driver.quit()

    session = requests.Session()
    for cookie in selenium_cookies:
        session.cookies.set(cookie["name"], cookie["value"])
    session = set_csrf_token(session)
    return session
def parse_date(date_str):
    """Parse ``"Jan 2020"`` or ``"2020"`` into a datetime; return None if neither fits."""
    for fmt in ("%b %Y", "%Y"):
        try:
            return datetime.strptime(date_str, fmt)
        except ValueError:
            continue
    return None


def parse_duration(duration):
    """Split a ``"<from> - <to> · <length>"`` caption into start and end datetimes.

    Only the part before the first ``" · "`` is read, so captions with extra
    segments (``"Jan 2020 - Feb 2021 · 1 yr · Remote"``) or without an end
    date (``"2019 · 1 yr"``) are handled instead of raising.

    Args:
        duration: Caption text; an empty value or one without ``" · "``
            yields ``(None, None)``.

    Returns:
        ``(from_date, to_date)``; either may be ``None``. ``to_date`` is
        ``None`` for ``"Present"`` or a missing end.
    """
    if not duration:
        return None, None
    date_range, separator, _ = duration.partition(" · ")
    if not separator:
        return None, None

    from_date_str, _, to_date_str = date_range.partition(" - ")
    if to_date_str == "Present":
        to_date_str = ""

    from_date = parse_date(from_date_str) if from_date_str else None
    to_date = parse_date(to_date_str) if to_date_str else None
    return from_date, to_date
def parse_dates(date_str):
    """Extract a start and end date from text such as ``"May 2018 - Jun 2024"``.

    Tokens of the form ``"<word> <year>"``, ``"<year>"`` or ``"Present"`` are
    collected. Missing month/day components default to January / the 1st,
    so results no longer depend on the day the code runs. A token dateutil
    cannot parse falls back to its four-digit year instead of raising.

    Args:
        date_str: Free text containing zero, one or two dates.

    Returns:
        ``(start_date, end_date)`` as ``datetime.date`` objects or ``None``.
        With ``"Present"`` among the tokens the end is ``None`` and the
        start is the first token (``None`` if ``"Present"`` stands alone).
        More than two tokens without ``"Present"`` yield ``(None, None)``.
    """
    # Same parser the module imports at file level; imported here too so the
    # function is self-contained.
    from dateutil.parser import parse

    if not date_str:
        return None, None
    matches = re.findall(r"(\b\w+ \d{4}|\b\d{4}|\bPresent)", date_str)
    if not matches:
        return None, None

    # Fixed anchor so dateutil does not fill missing fields from today's date.
    anchor = datetime(1900, 1, 1)

    def to_date(token):
        """Convert one matched token into a date, or None if it holds no date."""
        try:
            return parse(token, default=anchor).date()
        except (ValueError, OverflowError):
            year = re.search(r"\d{4}", token)
            if year and int(year.group()) >= 1:
                return datetime(int(year.group()), 1, 1).date()
            return None

    start_date, end_date = None, None
    if "Present" in matches:
        if len(matches) > 1:
            start_date = to_date(matches[0])
    elif len(matches) == 2:
        start_date, end_date = to_date(matches[0]), to_date(matches[1])
    elif len(matches) == 1:
        start_date = to_date(matches[0])
    return start_date, end_date
def clean_df(staff_df):
    """Cast the numeric staff columns to pandas' nullable ``Int64`` dtype.

    Columns that are absent are skipped. The frame is modified in place and
    also returned.
    """
    for column in ("estimated_age", "followers", "connections", "mutuals"):
        if column not in staff_df.columns:
            continue
        staff_df[column] = staff_df[column].astype("Int64")
    return staff_df