[
  {
    "path": "LICENSE",
    "content": "Copyright 2024 Graham Neubig\n\nPermission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.\n"
  },
  {
    "path": "README.md",
    "content": "# Research Career Tools 🔬\n\nOrganized by [Graham Neubig](http://www.phontron.com).\n\nThis is a set of scripts that may be helpful to people looking for a career in research.\n\n* [Find My Citers](find_my_citers/): This script helps find people who have cited your papers, which can be useful to find recommendation letter writers for job or visa applications.\n"
  },
  {
    "path": "find_my_citers/README.md",
    "content": "# Find My Citers\n\nby [Graham Neubig](http://www.phontron.com)\n\nThis is a script that helps find people who have cited your papers, which can be useful to find recommendation letter writers for job or visa applications.\n\nIt is based on the [Semantic Scholar API](https://api.semanticscholar.org/api-docs/).\n\n## Usage\n\nInstall requirements, mostly `pys2`,\na python library for the [Semantic Scholar (S2) API](api.semanticscholar.org/).\n\n```bash\npip install -r requirements.txt\n```\n\nFind your semantic scholar profile, and copy-paste the number from the URL. For example, if the URL is `https://www.semanticscholar.org/author/Graham-Neubig/1700325` then the number is `1700325`.\n\n```bash\npython find_my_citers.py --author_id AUTHOR_ID\n```\n\nOptionally, if you have a semantic scholar API key, you can add it to make the process go faster.\n\n```bash\npython find_my_citers.py --author_id AUTHOR_ID --s2_api_key API_KEY\n```\n"
  },
  {
    "path": "find_my_citers/find_my_citers.py",
    "content": "import argparse\nfrom collections import defaultdict, Counter\nimport csv\nimport time\nimport matplotlib.pyplot as plt\nfrom semanticscholar import SemanticScholar\nfrom tenacity import RetryError\nfrom tqdm import tqdm\n\nsch: SemanticScholar | None = None\n\n\ndef get_author_name(author_id: str) -> str:\n    \"\"\"Fetch the name of the author given the author ID.\"\"\"\n    author_details = sch.get_author(author_id)\n    return author_details.name.replace(\" \", \"_\")\n\n\ndef get_author_papers(author_id: str) -> list[dict]:\n    \"\"\"Fetch papers for a given author ID from Semantic Scholar.\"\"\"\n    author = sch.get_author(author_id)\n    return [{\"title\": paper.title, \"paperId\": paper.paperId} for paper in author.papers]\n\n\ndef get_citations(paper_id: str) -> list[dict]:\n    \"\"\"Fetch citations for a given paper ID from Semantic Scholar.\"\"\"\n    paper_details = sch.get_paper(paper_id)\n    return [\n        {\n            \"title\": citation.title,\n            \"paperId\": citation.paperId,\n            \"year\": citation.year,\n            \"authors\": citation.authors,\n        }\n        for citation in paper_details.citations\n    ]\n\n\ndef process_citations(papers: list[dict], citation_counts: defaultdict, citation_years: list, desc: str = \"Papers\") -> list[dict]:\n    \"\"\"Process citations for a list of papers, returning any that failed.\"\"\"\n    failed = []\n    for paper in (pbar := tqdm(papers, desc=desc, unit=\"paper\")):\n        pbar.set_postfix_str(f\"fetching: {paper['title'][:40]}\")\n        try:\n            citations = get_citations(paper[\"paperId\"])\n        except RetryError:\n            tqdm.write(f\"Rate limit exceeded for '{paper['title']}', will retry.\")\n            failed.append(paper)\n            time.sleep(10)\n            continue\n        for citation in tqdm(\n            citations, desc=paper[\"title\"][:50], unit=\"citation\", leave=False\n        ):\n            for author in citation[\"authors\"]:\n                author_name = (\n                    author.get(\"name\") if isinstance(author, dict) else author.name\n                )\n                citation_counts[author_name] += 1\n            if citation[\"year\"] is not None:\n                citation_years.append(citation[\"year\"])\n        time.sleep(1)\n    return failed\n\n\ndef find_my_citers(author_id: str) -> list[tuple[str, int]]:\n    your_paper_ids = get_author_papers(author_id)\n    citation_counts = defaultdict(int)\n    citation_years = []\n\n    failed = process_citations(your_paper_ids, citation_counts, citation_years)\n    if failed:\n        tqdm.write(f\"\\nRetrying {len(failed)} failed paper(s) after a cooldown...\")\n        time.sleep(60)\n        still_failed = process_citations(failed, citation_counts, citation_years, desc=\"Retrying\")\n        if still_failed:\n            tqdm.write(f\"Warning: {len(still_failed)} paper(s) permanently skipped due to rate limits: \"\n                       + \", \".join(p[\"title\"] for p in still_failed))\n\n    sorted_citation_counts = sorted(\n        citation_counts.items(), key=lambda item: item[1], reverse=True\n    )\n\n    return sorted_citation_counts, citation_years\n\n\ndef export_citation_data(sorted_citation_counts, author_name):\n    \"\"\"Export citation data to a CSV file named after the author.\"\"\"\n    filename = f\"{author_name}_citation_data.csv\"\n    with open(filename, \"w\", newline=\"\") as file:\n        writer = csv.writer(file)\n        writer.writerow([\"Author\", \"Citation Count\"])\n        writer.writerows(sorted_citation_counts)\n    print(f\"Citation data exported to {filename}\")\n    return filename\n\n\ndef plot_citation_trends(citation_years, author_name):\n    \"\"\"Create and save a time-series plot of citation trends over time.\"\"\"\n    year_counts = Counter(citation_years)\n    years = sorted(year_counts.keys())\n    counts = [year_counts[year] for year in years]\n\n    plt.figure(figsize=(10, 6))\n    plt.plot(years, counts, marker=\"o\")\n    plt.title(f\"Citation Trends Over Time for {author_name}\")\n    plt.xlabel(\"Year\")\n    plt.ylabel(\"Number of Citations\")\n    plt.tight_layout()\n    plot_filename = f\"{author_name}_citation_trends.png\"\n    plt.savefig(plot_filename)\n    plt.close()\n    print(f\"Citation trend plot saved as {plot_filename}\")\n    return plot_filename\n\n\nif __name__ == \"__main__\":\n    parser = argparse.ArgumentParser(\n        description=\"Find authors who have cited your work the most using PyS2\"\n    )\n    parser.add_argument(\n        \"--author_id\",\n        help=(\n            \"The author ID to search for. \"\n            \"If not provided, the script will prompt for input.\"\n        ),\n        default=None,\n    )\n    parser.add_argument(\n        \"--s2_api_key\",\n        type=str,\n        default=None,\n        help=\"An API key for semantic scholar if you have one.\",\n    )\n\n    args = parser.parse_args()\n    sch = SemanticScholar(api_key=args.s2_api_key)\n\n    if args.author_id is None:\n        author_id = input(\"Enter the author ID: \")\n    else:\n        author_id = args.author_id\n\n    author_name = get_author_name(author_id)\n    sorted_citation_counts, citation_years = find_my_citers(author_id)\n    csv_filename = export_citation_data(sorted_citation_counts, author_name)\n    plot_filename = plot_citation_trends(citation_years, author_name)\n"
  },
  {
    "path": "find_my_citers/requirements.txt",
    "content": "semanticscholar\nmatplotlib\ntqdm\n"
  },
  {
    "path": "find_recent_additions/.gitignore",
    "content": "CSrankings/\nfaculty_info.csv\n__pycache__/\n.pytest_cache/"
  },
  {
    "path": "find_recent_additions/README.md",
    "content": "# CSRankings Recent Additions Tracker\n\nThis script analyzes the CSRankings database to track recent additions to university faculty members, including their publication counts and research areas.\n\n## Description\n\nThe script processes the CSRankings database to extract:\n\n- Faculty member names and affiliations\n- When they were added to the database\n- Their research areas\n- Publication counts\n\n## Prerequisites\n\n- Python 3.6+\n- Git\n\n## Installation\n\nFirst, clone the CSRankings repository (required for data):\n\n```bash\ngit clone https://github.com/emeryberger/CSrankings.git\n```\n\nSecond, install required Python packages:\n\n```bash\npip install -r requirements.txt\n```\n\n## Usage\n\nFirst, make sure the CSRankings repository is in the same directory as the script then run it:\n\n```bash\npython find_recent_additions.py\n```\n\nThe script will generate a `faculty_info.csv` file containing:\n\n- Name\n- Affiliation\n- Research areas\n- Date added to CSRankings\n- Publication count\n\n## Output Format\n\nThe output CSV file contains the following columns:\n\n- `name`: Faculty member's name\n- `affiliation`: University affiliation\n- `areas`: Research areas\n- `added_date`: Date when added to CSRankings\n- `publication_count`: Number of publications\n\n## Performance\n\nThe script uses multiprocessing to parallelize faculty data processing, utilizing all available CPU cores for improved performance.\n\n## Dependencies\n\n- `csv`: For CSV file handling\n- `multiprocessing`: For parallel processing\n- `subprocess`: For Git operations\n- `tqdm`: For progress bars\n\n## Notes\n\n- The script requires the CSRankings repository to be up-to-date for accurate addition dates\n- Git blame is used to determine when entries were added to the database\n"
  },
  {
    "path": "find_recent_additions/find_recent_additions.py",
    "content": "import csv\nimport subprocess\nfrom collections import defaultdict\nfrom datetime import datetime\nfrom tqdm import tqdm\nimport re\nimport asyncio\nfrom asyncio import Queue\n\ndef load_git_history():\n    git_history_cache = {}\n    # Get git blame information\n    cmd = ['git', 'blame', '--date=iso', 'csrankings.csv']\n    blame_output = subprocess.check_output(cmd, cwd='CSrankings', text=True)\n    \n    # Regex pattern to match the date format in git blame output\n    date_pattern = r'\\((.*?)\\s+(\\d{4}-\\d{2}-\\d{2}\\s+\\d{2}:\\d{2}:\\d{2}\\s+[+-]\\d{4})'\n    \n    for line in blame_output.split('\\n'):\n        if not line:\n            continue\n        match = re.search(date_pattern, line)\n        if match and len(line.split(')')) > 1:\n            date_str = match.group(2)\n            fields = line.split(')')[-1].strip().split(',')\n            if len(fields) >= 1:\n                name = fields[0].strip()\n                git_history_cache[name] = date_str\n    return git_history_cache\n\n\ndef read_homepage_data():\n    homepage_data = {}\n    with open('CSrankings/homepage-validated.csv', 'r', encoding='utf-8') as f:\n        reader = csv.reader(f)\n        for row in reader:\n            if len(row) >= 2:\n                homepage_data[row[0]] = row[1]\n    return homepage_data\n\ndef read_author_info():\n    pub_counts = defaultdict(int)\n    author_conferences = defaultdict(set)\n    with open('CSrankings/generated-author-info.csv', 'r', encoding='utf-8') as f:\n        reader = csv.reader(f)\n        next(reader)  # Skip header\n        for row in reader:\n            if len(row) >= 4:\n                author_name = row[0].strip()\n                conference = row[2].strip()\n                try:\n                    count = float(row[3])\n                    pub_counts[author_name] += count\n                    author_conferences[author_name].add(conference)\n                except ValueError:\n                    continue\n    return pub_counts, author_conferences\n\ndef read_country_info():\n    country_data = {}\n    with open('CSrankings/country-info.csv', 'r', encoding='utf-8') as f:\n        reader = csv.reader(f)\n        for row in reader:\n            if len(row) >= 3:\n                institution = row[0].strip()\n                region = row[1].strip()\n                country = row[2].strip()\n                country_data[institution] = f\"{region}-{country}\"\n    return country_data\n\nasync def process_faculty_member(name, affiliation, homepage, homepage_data, pub_counts, conferences, git_history, country_data):\n    # Get the last modified date using git log\n    added_date = git_history.get(name, '')\n    \n    # Get publication count and conferences\n    pub_count = pub_counts.get(name, 0)\n    conf_list = ','.join(sorted(conferences.get(name, set())))\n    \n    # Get affiliation location\n    affiliation_location = country_data.get(affiliation, 'us-us')\n    \n    return {\n        'name': name,\n        'affiliation': affiliation,\n        'affiliation_location': affiliation_location,\n        'added_date': added_date,\n        'publication_count': pub_count,\n        'conferences': conf_list\n    }\n\nasync def process_faculty_files():\n    faculty_data = []\n    homepage_data = read_homepage_data()\n    pub_counts, author_conferences = read_author_info()\n    country_data = read_country_info()\n    \n    print(\"Loading git history...\")\n    git_history = load_git_history()\n    \n    # Process the combined csrankings.csv file\n    file_path = 'CSrankings/csrankings.csv'\n    with open(file_path, 'r', encoding='utf-8') as f:\n        reader = csv.reader(f)\n        next(reader)  # Skip header\n        rows = list(reader)\n    \n    # Create tasks for async processing\n    tasks = []\n    for row in rows:\n        if len(row) >= 3:\n            name = row[0].strip()\n            affiliation = row[1].strip()\n            homepage = row[2].strip()\n            tasks.append(process_faculty_member(name, affiliation, homepage, homepage_data, \n                                             pub_counts, author_conferences, git_history, country_data))\n    \n    print(f\"Processing {len(tasks)} faculty members...\")\n    \n    # Process faculty members concurrently\n    faculty_data = await asyncio.gather(*tasks)\n    \n    return faculty_data\n\ndef write_output(faculty_data):\n    with open('faculty_info.csv', 'w', newline='', encoding='utf-8') as f:\n        writer = csv.DictWriter(f, fieldnames=['name', 'affiliation', 'affiliation_location', \n                                             'added_date', 'publication_count', 'conferences'])\n        writer.writeheader()\n        writer.writerows(faculty_data)\n\nasync def main():\n    faculty_data = await process_faculty_files()\n    write_output(faculty_data)\n    print(f\"Processed {len(faculty_data)} faculty entries\")\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "find_recent_additions/requirements.txt",
    "content": "tqdm\npytest\n"
  }
]