Repository: neubig/research-career-tools Branch: main Commit: b2eaffe6642c Files: 9 Total size: 13.8 KB Directory structure: gitextract_we871cek/ ├── LICENSE ├── README.md ├── find_my_citers/ │ ├── README.md │ ├── find_my_citers.py │ └── requirements.txt └── find_recent_additions/ ├── .gitignore ├── README.md ├── find_recent_additions.py └── requirements.txt ================================================ FILE CONTENTS ================================================ ================================================ FILE: LICENSE ================================================ Copyright 2024 Graham Neubig Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.md ================================================ # Research Career Tools 🔬 Organized by [Graham Neubig](http://www.phontron.com). This is a set of scripts that may be helpful to people looking for a career in research. 
* [Find My Citers](find_my_citers/): This script helps find people who have cited your papers, which can be useful to find recommendation letter writers for job or visa applications. * [Find Recent Additions](find_recent_additions/): This script analyzes the CSRankings database to track recent additions to university faculty members. ================================================ FILE: find_my_citers/README.md ================================================ # Find My Citers by [Graham Neubig](http://www.phontron.com) This is a script that helps find people who have cited your papers, which can be useful to find recommendation letter writers for job or visa applications. It is based on the [Semantic Scholar API](https://api.semanticscholar.org/api-docs/). ## Usage Install the requirements, mostly `semanticscholar`, a Python library for the [Semantic Scholar (S2) API](https://api.semanticscholar.org/). ```bash pip install -r requirements.txt ``` Find your Semantic Scholar profile, and copy-paste the number from the URL. For example, if the URL is `https://www.semanticscholar.org/author/Graham-Neubig/1700325` then the number is `1700325`. ```bash python find_my_citers.py --author_id AUTHOR_ID ``` Optionally, if you have a Semantic Scholar API key, you can add it to make the process go faster. 
"""Find the authors who cite a given Semantic Scholar author most often.

Usage::

    python find_my_citers.py --author_id AUTHOR_ID
    python find_my_citers.py --author_id AUTHOR_ID --s2_api_key API_KEY

The script writes ``<author>_citation_data.csv`` (citing author, count) and
``<author>_citation_trends.png`` (citations per year) to the current
directory.

Third-party requirements: semanticscholar, matplotlib, tqdm.
"""

import argparse
import csv
import time
from collections import Counter, defaultdict

import matplotlib.pyplot as plt
from semanticscholar import SemanticScholar
from tenacity import RetryError
from tqdm import tqdm

# Shared API client. It is created in the ``__main__`` section below; every
# helper that talks to Semantic Scholar reads this module-level name.
sch: SemanticScholar | None = None


def _display_title(paper: dict) -> str:
    """Return a printable title for *paper*, even when the title is missing."""
    # NOTE(review): a missing/None title is assumed to be possible in API
    # results; the original code would raise TypeError when slicing it.
    return paper.get("title") or "<untitled>"


def get_author_name(author_id: str) -> str:
    """Fetch the author's name, with spaces replaced by underscores.

    The result is used as a filename prefix for the exported files.
    """
    author_details = sch.get_author(author_id)
    return author_details.name.replace(" ", "_")


def get_author_papers(author_id: str) -> list[dict]:
    """Fetch the papers of an author as ``{"title", "paperId"}`` dicts."""
    author = sch.get_author(author_id)
    return [
        {"title": paper.title, "paperId": paper.paperId}
        for paper in (author.papers or [])
    ]


def get_citations(paper_id: str) -> list[dict]:
    """Fetch the citations of a paper.

    Each citation is returned as a dict with the keys ``title``, ``paperId``,
    ``year`` and ``authors``.
    """
    paper_details = sch.get_paper(paper_id)
    return [
        {
            "title": citation.title,
            "paperId": citation.paperId,
            "year": citation.year,
            "authors": citation.authors,
        }
        for citation in (paper_details.citations or [])
    ]


def process_citations(
    papers: list[dict],
    citation_counts: defaultdict,
    citation_years: list,
    desc: str = "Papers",
) -> list[dict]:
    """Accumulate citing-author counts and citation years for *papers*.

    Args:
        papers: Paper dicts as produced by :func:`get_author_papers`.
        citation_counts: Mapping of citing author name to count; updated in
            place.
        citation_years: List of citation years; extended in place. Citations
            without a year are not recorded here.
        desc: Label for the progress bar.

    Returns:
        The papers whose citations could not be fetched because the API
        client raised ``RetryError``; callers may pass them back in later.
    """
    failed = []
    for paper in (pbar := tqdm(papers, desc=desc, unit="paper")):
        title = _display_title(paper)
        pbar.set_postfix_str(f"fetching: {title[:40]}")
        try:
            citations = get_citations(paper["paperId"])
        except RetryError:
            tqdm.write(f"Rate limit exceeded for '{title}', will retry.")
            failed.append(paper)
            # Back off before moving on to the next paper.
            time.sleep(10)
            continue

        for citation in tqdm(
            citations, desc=title[:50], unit="citation", leave=False
        ):
            for author in citation["authors"] or []:
                # Authors may arrive either as plain dicts or as objects.
                author_name = (
                    author.get("name") if isinstance(author, dict) else author.name
                )
                citation_counts[author_name] += 1
            if citation["year"] is not None:
                citation_years.append(citation["year"])

        # Pause between papers (one API request each) to stay polite.
        time.sleep(1)
    return failed


def find_my_citers(author_id: str) -> tuple[list[tuple[str, int]], list[int]]:
    """Count who cites the given author's papers.

    Papers that fail on the first pass are retried once after a cooldown;
    papers that fail again are reported and skipped.

    Returns:
        A pair ``(sorted_citation_counts, citation_years)`` where the first
        item is a list of ``(author_name, count)`` sorted by descending count
        and the second is the list of years of all counted citations.
    """
    your_paper_ids = get_author_papers(author_id)
    citation_counts = defaultdict(int)
    citation_years = []

    failed = process_citations(your_paper_ids, citation_counts, citation_years)

    if failed:
        tqdm.write(f"\nRetrying {len(failed)} failed paper(s) after a cooldown...")
        time.sleep(60)
        still_failed = process_citations(
            failed, citation_counts, citation_years, desc="Retrying"
        )
        if still_failed:
            tqdm.write(
                f"Warning: {len(still_failed)} paper(s) permanently skipped "
                "due to rate limits: "
                + ", ".join(_display_title(p) for p in still_failed)
            )

    sorted_citation_counts = sorted(
        citation_counts.items(), key=lambda item: item[1], reverse=True
    )
    return sorted_citation_counts, citation_years


def export_citation_data(sorted_citation_counts, author_name):
    """Export citation data to a CSV file named after the author.

    Returns the name of the file that was written.
    """
    filename = f"{author_name}_citation_data.csv"
    # Author names are frequently non-ASCII; write UTF-8 explicitly instead of
    # relying on the platform default encoding.
    with open(filename, "w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow(["Author", "Citation Count"])
        writer.writerows(sorted_citation_counts)
    print(f"Citation data exported to {filename}")
    return filename


def plot_citation_trends(citation_years, author_name):
    """Create and save a time-series plot of citation counts per year.

    Returns the name of the PNG file that was written.
    """
    year_counts = Counter(citation_years)
    years = sorted(year_counts)
    counts = [year_counts[year] for year in years]

    plt.figure(figsize=(10, 6))
    plt.plot(years, counts, marker="o")
    plt.title(f"Citation Trends Over Time for {author_name}")
    plt.xlabel("Year")
    plt.ylabel("Number of Citations")
    plt.tight_layout()

    plot_filename = f"{author_name}_citation_trends.png"
    plt.savefig(plot_filename)
    plt.close()
    print(f"Citation trend plot saved as {plot_filename}")
    return plot_filename


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Find authors who have cited your work the most using PyS2"
    )
    parser.add_argument(
        "--author_id",
        help=(
            "The author ID to search for. "
            "If not provided, the script will prompt for input."
        ),
        default=None,
    )
    parser.add_argument(
        "--s2_api_key",
        type=str,
        default=None,
        help="An API key for semantic scholar if you have one.",
    )
    args = parser.parse_args()

    sch = SemanticScholar(api_key=args.s2_api_key)

    if args.author_id is None:
        # Strip stray whitespace/newlines from interactive input.
        author_id = input("Enter the author ID: ").strip()
    else:
        author_id = args.author_id

    author_name = get_author_name(author_id)
    sorted_citation_counts, citation_years = find_my_citers(author_id)
    csv_filename = export_citation_data(sorted_citation_counts, author_name)
    plot_filename = plot_citation_trends(citation_years, author_name)
## Description The script processes the CSRankings database to extract: - Faculty member names and affiliations - When they were added to the database - Their research areas - Publication counts ## Prerequisites - Python 3.7+ - Git ## Installation First, clone the CSRankings repository (required for data): ```bash git clone https://github.com/emeryberger/CSrankings.git ``` Second, install required Python packages: ```bash pip install -r requirements.txt ``` ## Usage First, make sure the CSRankings repository is in the same directory as the script, then run it from that directory: ```bash python find_recent_additions.py ``` The script will generate a `faculty_info.csv` file containing: - Name - Affiliation - Affiliation location - Date added to CSRankings - Publication count - Conferences ## Output Format The output CSV file contains the following columns: - `name`: Faculty member's name - `affiliation`: University affiliation - `affiliation_location`: Region and country of the affiliation, as `region-country` (`us-us` when the institution is not listed in `country-info.csv`) - `added_date`: Date when added to CSRankings - `publication_count`: Number of publications - `conferences`: Comma-separated list of the research areas/conferences the faculty member has published in ## Performance The script creates one `asyncio` coroutine per faculty member and collects the results with `asyncio.gather`; it runs in a single process and does not use multiprocessing. 
"""Summarise CSRankings faculty entries and when each entry was recorded.

The script expects a checkout of the CSrankings repository in
``./CSrankings`` and writes ``faculty_info.csv`` to the current directory.

Notes:
    * ``git blame`` on ``csrankings.csv`` supplies the date for each entry,
      so the checkout must be a git repository and should be up to date.
    * Standard-library modules used: csv, subprocess, re, asyncio.
"""

import asyncio
import csv
import re
import subprocess
from asyncio import Queue
from collections import defaultdict
from datetime import datetime

try:
    from tqdm import tqdm
except ImportError:
    # Nothing in this module calls tqdm, so its absence must not stop the
    # script from running.
    tqdm = None

# One line of ``git blame --date=iso`` output:
#   <commit> [<file>] (<author> <YYYY-MM-DD HH:MM:SS +ZZZZ> <lineno>) <content>
# Anchoring on the ``<lineno>)`` that follows the date lets the content itself
# contain ')' without confusing the parser.
BLAME_LINE_RE = re.compile(
    r'\((?P<author>.*?)\s+'
    r'(?P<date>\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2}\s+[+-]\d{4})'
    r'\s+\d+\) ?(?P<content>.*)'
)


def parse_blame_line(line):
    """Parse one ``git blame --date=iso`` line of a CSV file.

    Args:
        line: A single line of blame output (without trailing newline).

    Returns:
        ``(name, date_str)`` where ``name`` is the first CSV field of the
        blamed line and ``date_str`` is the ISO-style date printed by git, or
        ``None`` when the line is not a blame line or has no first field.
    """
    match = BLAME_LINE_RE.search(line)
    if match is None:
        return None

    content = match.group('content')
    try:
        # Parse with the csv module so quoted names (which may contain
        # commas) come out exactly as csv.reader yields them elsewhere.
        fields = next(csv.reader([content]), [])
    except csv.Error:
        fields = content.split(',')

    if not fields:
        return None
    name = fields[0].strip()
    if not name:
        return None
    return name, match.group('date')


def load_git_history():
    """Map each name in ``csrankings.csv`` to the date git blames its line on.

    Runs ``git blame --date=iso csrankings.csv`` inside ``CSrankings``.
    Blame reports the commit that last changed a line, so the date is when
    the entry's line was last touched.

    Returns:
        dict mapping name to the date string printed by git.

    Raises:
        subprocess.CalledProcessError: if the git command fails.
    """
    cmd = ['git', 'blame', '--date=iso', 'csrankings.csv']
    # Decode as UTF-8 (matching how the CSV files are read below) rather than
    # with the locale's encoding, so non-ASCII names compare equal.
    blame_output = subprocess.check_output(
        cmd, cwd='CSrankings', encoding='utf-8', errors='replace'
    )

    git_history_cache = {}
    for line in blame_output.split('\n'):
        parsed = parse_blame_line(line)
        if parsed is not None:
            name, date_str = parsed
            git_history_cache[name] = date_str
    return git_history_cache


def read_homepage_data():
    """Read ``CSrankings/homepage-validated.csv`` into a dict.

    Returns:
        dict mapping the first column of each row to its second column; rows
        with fewer than two columns are ignored.
    """
    homepage_data = {}
    with open('CSrankings/homepage-validated.csv', 'r', encoding='utf-8') as f:
        for row in csv.reader(f):
            if len(row) >= 2:
                homepage_data[row[0]] = row[1]
    return homepage_data


def read_author_info():
    """Aggregate ``CSrankings/generated-author-info.csv`` per author.

    The first row is skipped as a header. For every later row with at least
    four columns, column 0 is the author name, column 2 the conference label
    and column 3 a numeric count; rows whose count is not a number are
    skipped.

    Returns:
        ``(pub_counts, author_conferences)``: the summed counts per author
        and the set of conference labels per author.
    """
    pub_counts = defaultdict(int)
    author_conferences = defaultdict(set)
    with open('CSrankings/generated-author-info.csv', 'r', encoding='utf-8') as f:
        reader = csv.reader(f)
        next(reader)  # Skip header
        for row in reader:
            if len(row) < 4:
                continue
            try:
                count = float(row[3])
            except ValueError:
                continue
            author_name = row[0].strip()
            pub_counts[author_name] += count
            author_conferences[author_name].add(row[2].strip())
    return pub_counts, author_conferences


def read_country_info():
    """Read ``CSrankings/country-info.csv`` into a location lookup.

    Returns:
        dict mapping institution (column 0) to ``"<region>-<country>"`` built
        from columns 1 and 2; rows with fewer than three columns are ignored.
    """
    country_data = {}
    with open('CSrankings/country-info.csv', 'r', encoding='utf-8') as f:
        for row in csv.reader(f):
            if len(row) >= 3:
                institution = row[0].strip()
                region = row[1].strip()
                country = row[2].strip()
                country_data[institution] = f"{region}-{country}"
    return country_data


async def process_faculty_member(name, affiliation, homepage, homepage_data,
                                 pub_counts, conferences, git_history,
                                 country_data):
    """Build the output row for one faculty member.

    ``homepage`` and ``homepage_data`` are accepted for interface
    compatibility but are not used in the row.

    Returns:
        dict with the keys written to ``faculty_info.csv``.
    """
    return {
        'name': name,
        'affiliation': affiliation,
        # Institutions missing from the country table fall back to 'us-us'.
        'affiliation_location': country_data.get(affiliation, 'us-us'),
        # Date reported by git blame for this name ('' when unknown).
        'added_date': git_history.get(name, ''),
        'publication_count': pub_counts.get(name, 0),
        'conferences': ','.join(sorted(conferences.get(name, set()))),
    }


async def process_faculty_files():
    """Load all inputs and build one output row per ``csrankings.csv`` row.

    Returns:
        list of row dicts, in the order of the rows in
        ``CSrankings/csrankings.csv`` (rows with fewer than three columns are
        skipped).
    """
    homepage_data = read_homepage_data()
    pub_counts, author_conferences = read_author_info()
    country_data = read_country_info()

    print("Loading git history...")
    git_history = load_git_history()

    # Process the combined csrankings.csv file
    file_path = 'CSrankings/csrankings.csv'
    with open(file_path, 'r', encoding='utf-8') as f:
        reader = csv.reader(f)
        next(reader)  # Skip header
        rows = list(reader)

    tasks = [
        process_faculty_member(
            row[0].strip(), row[1].strip(), row[2].strip(),
            homepage_data, pub_counts, author_conferences,
            git_history, country_data,
        )
        for row in rows
        if len(row) >= 3
    ]

    print(f"Processing {len(tasks)} faculty members...")

    # gather() returns the results in the same order as the tasks.
    faculty_data = await asyncio.gather(*tasks)
    return faculty_data


def write_output(faculty_data):
    """Write the row dicts to ``faculty_info.csv`` with a header row."""
    fieldnames = ['name', 'affiliation', 'affiliation_location',
                  'added_date', 'publication_count', 'conferences']
    with open('faculty_info.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(faculty_data)


async def main():
    """Build the faculty table, write it out and print a summary line."""
    faculty_data = await process_faculty_files()
    write_output(faculty_data)
    print(f"Processed {len(faculty_data)} faculty entries")


if __name__ == '__main__':
    asyncio.run(main())