Repository: neubig/research-career-tools
Branch: main
Commit: b2eaffe6642c
Files: 9
Total size: 13.8 KB

Directory structure:
gitextract_we871cek/

├── LICENSE
├── README.md
├── find_my_citers/
│   ├── README.md
│   ├── find_my_citers.py
│   └── requirements.txt
└── find_recent_additions/
    ├── .gitignore
    ├── README.md
    ├── find_recent_additions.py
    └── requirements.txt

================================================
FILE CONTENTS
================================================

================================================
FILE: LICENSE
================================================
Copyright 2024 Graham Neubig

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


================================================
FILE: README.md
================================================
# Research Career Tools 🔬

Organized by [Graham Neubig](http://www.phontron.com).

This is a set of scripts that may be helpful to people looking for a career in research.

* [Find My Citers](find_my_citers/): This script helps find people who have cited your papers, which can be useful to find recommendation letter writers for job or visa applications.


================================================
FILE: find_my_citers/README.md
================================================
# Find My Citers

by [Graham Neubig](http://www.phontron.com)

This is a script that helps find people who have cited your papers, which can be useful to find recommendation letter writers for job or visa applications.

It is based on the [Semantic Scholar API](https://api.semanticscholar.org/api-docs/).

## Usage

Install the requirements, mostly `semanticscholar`,
a Python library for the [Semantic Scholar (S2) API](https://api.semanticscholar.org/).

```bash
pip install -r requirements.txt
```

Find your semantic scholar profile, and copy-paste the number from the URL. For example, if the URL is `https://www.semanticscholar.org/author/Graham-Neubig/1700325` then the number is `1700325`.

```bash
python find_my_citers.py --author_id AUTHOR_ID
```

Optionally, if you have a semantic scholar API key, you can add it to make the process go faster.

```bash
python find_my_citers.py --author_id AUTHOR_ID --s2_api_key API_KEY
```


================================================
FILE: find_my_citers/find_my_citers.py
================================================
import argparse
from collections import defaultdict, Counter
import csv
import time
import matplotlib.pyplot as plt
from semanticscholar import SemanticScholar
from tenacity import RetryError
from tqdm import tqdm

# Shared Semantic Scholar client used by the fetch helpers in this module.
# It stays None until the ``__main__`` block assigns a configured client.
sch: SemanticScholar | None = None


def get_author_name(author_id: str) -> str:
    """Look up an author and return their name with spaces turned into underscores.

    Args:
        author_id: Semantic Scholar author ID.

    Returns:
        The author's name where every space is replaced by ``_``.
    """
    display_name = sch.get_author(author_id).name
    # Splitting on a literal space and re-joining is equivalent to replace(" ", "_").
    return "_".join(display_name.split(" "))


def get_author_papers(author_id: str) -> list[dict]:
    """Return a title/ID record for each paper attached to an author.

    Args:
        author_id: Semantic Scholar author ID.

    Returns:
        One ``{"title": ..., "paperId": ...}`` dict per paper, in the order
        the papers appear on the fetched author object.
    """
    records = []
    for entry in sch.get_author(author_id).papers:
        records.append({"title": entry.title, "paperId": entry.paperId})
    return records


def get_citations(paper_id: str) -> list[dict]:
    """Return one record per citation listed on the given paper.

    Args:
        paper_id: Semantic Scholar paper ID.

    Returns:
        A list of dicts with the keys ``title``, ``paperId``, ``year`` and
        ``authors``, copied from each citation of the fetched paper.
    """
    citing_papers = sch.get_paper(paper_id).citations
    records = []
    for citing in citing_papers:
        records.append(
            dict(
                title=citing.title,
                paperId=citing.paperId,
                year=citing.year,
                authors=citing.authors,
            )
        )
    return records


def process_citations(papers: list[dict], citation_counts: defaultdict, citation_years: list, desc: str = "Papers") -> list[dict]:
    """Tally citing authors and citing years for each paper; return the papers that failed.

    Args:
        papers: Paper records with ``title`` and ``paperId`` keys.
        citation_counts: Mapping updated in place; each citing author's name
            is incremented once per citing paper they appear on.
        citation_years: List updated in place with the year of every citation
            whose year is not ``None``.
        desc: Label shown on the outer progress bar.

    Returns:
        The papers whose citation fetch raised ``RetryError``.
    """
    unfetched = []
    progress = tqdm(papers, desc=desc, unit="paper")
    for paper in progress:
        progress.set_postfix_str(f"fetching: {paper['title'][:40]}")
        try:
            citations = get_citations(paper["paperId"])
        except RetryError:
            # Remember the paper for a later retry and back off before the next request.
            tqdm.write(f"Rate limit exceeded for '{paper['title']}', will retry.")
            unfetched.append(paper)
            time.sleep(10)
        else:
            citation_bar = tqdm(
                citations, desc=paper["title"][:50], unit="citation", leave=False
            )
            for citation in citation_bar:
                for author in citation["authors"]:
                    # Authors may be plain dicts or objects exposing ``.name``.
                    if isinstance(author, dict):
                        who = author.get("name")
                    else:
                        who = author.name
                    citation_counts[who] += 1
                year = citation["year"]
                if year is not None:
                    citation_years.append(year)
            # Short pause between successful requests.
            time.sleep(1)
    return unfetched


def find_my_citers(author_id: str) -> tuple[list[tuple[str, int]], list[int]]:
    """Count who cites an author's papers and collect the years of those citations.

    Papers whose citations cannot be fetched on the first pass are retried
    once after a cooldown; papers that fail again are reported and skipped.

    Args:
        author_id: Semantic Scholar author ID whose papers are examined.

    Returns:
        A ``(sorted_citation_counts, citation_years)`` pair.
        ``sorted_citation_counts`` is a list of ``(citing author name, count)``
        tuples ordered by descending count. ``citation_years`` holds the year
        of every citation that has one.
    """
    papers = get_author_papers(author_id)
    citation_counts = defaultdict(int)
    citation_years = []

    failed = process_citations(papers, citation_counts, citation_years)
    if failed:
        tqdm.write(f"\nRetrying {len(failed)} failed paper(s) after a cooldown...")
        # Give the API a longer break before the single retry pass.
        time.sleep(60)
        still_failed = process_citations(failed, citation_counts, citation_years, desc="Retrying")
        if still_failed:
            tqdm.write(f"Warning: {len(still_failed)} paper(s) permanently skipped due to rate limits: "
                       + ", ".join(p["title"] for p in still_failed))

    sorted_citation_counts = sorted(
        citation_counts.items(), key=lambda item: item[1], reverse=True
    )

    return sorted_citation_counts, citation_years


def export_citation_data(sorted_citation_counts, author_name):
    """Export citation data to a CSV file named after the author.

    Args:
        sorted_citation_counts: Iterable of ``(author, citation count)`` rows.
        author_name: Prefix for the output file name.

    Returns:
        The name of the file written, ``"<author_name>_citation_data.csv"``,
        created in the current working directory.
    """
    filename = f"{author_name}_citation_data.csv"
    # Write UTF-8 explicitly: citing authors' names often contain non-ASCII
    # characters that the platform's default encoding may not represent.
    with open(filename, "w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow(["Author", "Citation Count"])
        writer.writerows(sorted_citation_counts)
    print(f"Citation data exported to {filename}")
    return filename


def plot_citation_trends(citation_years, author_name):
    """Plot citations per year as a line chart and save it as a PNG.

    Args:
        citation_years: Iterable of years, one entry per citation.
        author_name: Used in the plot title and as the output file prefix.

    Returns:
        The name of the saved image, ``"<author_name>_citation_trends.png"``.
    """
    # Sorting the (year, count) pairs orders them by year, since years are unique keys.
    per_year = sorted(Counter(citation_years).items())
    years = [year for year, _ in per_year]
    counts = [total for _, total in per_year]

    plot_filename = f"{author_name}_citation_trends.png"

    plt.figure(figsize=(10, 6))
    plt.plot(years, counts, marker="o")
    plt.title(f"Citation Trends Over Time for {author_name}")
    plt.xlabel("Year")
    plt.ylabel("Number of Citations")
    plt.tight_layout()
    plt.savefig(plot_filename)
    plt.close()

    print(f"Citation trend plot saved as {plot_filename}")
    return plot_filename


if __name__ == "__main__":
    # Command-line entry point: parse options, create the shared client,
    # then fetch, export and plot the citation data for one author.
    cli = argparse.ArgumentParser(
        description="Find authors who have cited your work the most using PyS2"
    )
    cli.add_argument(
        "--author_id",
        default=None,
        help=(
            "The author ID to search for. "
            "If not provided, the script will prompt for input."
        ),
    )
    cli.add_argument(
        "--s2_api_key",
        default=None,
        type=str,
        help="An API key for semantic scholar if you have one.",
    )
    args = cli.parse_args()

    # Assign the module-level client that the fetch helpers read.
    sch = SemanticScholar(api_key=args.s2_api_key)

    # Fall back to an interactive prompt when no ID was given on the command line.
    author_id = (
        input("Enter the author ID: ") if args.author_id is None else args.author_id
    )

    author_name = get_author_name(author_id)
    sorted_citation_counts, citation_years = find_my_citers(author_id)
    csv_filename = export_citation_data(sorted_citation_counts, author_name)
    plot_filename = plot_citation_trends(citation_years, author_name)


================================================
FILE: find_my_citers/requirements.txt
================================================
semanticscholar
matplotlib
tqdm


================================================
FILE: find_recent_additions/.gitignore
================================================
CSrankings/
faculty_info.csv
__pycache__/
.pytest_cache/

================================================
FILE: find_recent_additions/README.md
================================================
# CSRankings Recent Additions Tracker

This script analyzes the CSRankings database to track recent additions to university faculty members, including their publication counts and research areas.

## Description

The script processes the CSRankings database to extract:

- Faculty member names and affiliations
- When they were added to the database
- Their research areas
- Publication counts

## Prerequisites

- Python 3.7+
- Git

## Installation

First, clone the CSRankings repository (required for data):

```bash
git clone https://github.com/emeryberger/CSrankings.git
```

Second, install required Python packages:

```bash
pip install -r requirements.txt
```

## Usage

Make sure the cloned `CSrankings` directory is in the same directory as the script, then run the script from that directory:

```bash
python find_recent_additions.py
```

The script will generate a `faculty_info.csv` file containing:

- Name
- Affiliation
- Affiliation location (region and country)
- Date added to CSRankings
- Publication count
- Conferences (research areas) with publications

## Output Format

The output CSV file contains the following columns:

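- `name`: Faculty member's name
- `affiliation`: University affiliation
- `affiliation_location`: Region and country of the affiliation, as `region-country` (`us-us` when the institution is not listed in `country-info.csv`)
- `added_date`: Date when added to CSRankings
- `publication_count`: Number of publications
- `conferences`: Comma-separated conferences (research areas) with publications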

## Performance

The script runs `git blame` once for the whole `csrankings.csv` file and keeps the result in memory, then builds each faculty record as an `asyncio` coroutine and gathers the results.

## Dependencies

- `csv`: For CSV file handling
- `asyncio`: For concurrent processing
- `subprocess`: For Git operations
- `tqdm`: For progress bars

## Notes

- The script requires the CSRankings repository to be up-to-date for accurate addition dates
- Git blame is used to determine when entries were added to the database


================================================
FILE: find_recent_additions/find_recent_additions.py
================================================
import csv
import subprocess
from collections import defaultdict
from datetime import datetime
from tqdm import tqdm
import re
import asyncio
from asyncio import Queue

def load_git_history(blame_output=None):
    """Map the first CSV field of each ``csrankings.csv`` line to its git blame date.

    Args:
        blame_output: Optional text in ``git blame --date=iso`` format. When
            ``None`` (the default), ``git blame --date=iso csrankings.csv`` is
            run inside the ``CSrankings`` directory.

    Returns:
        A dict from name (first comma-separated field of the blamed line,
        stripped) to the ISO date string found in the blame header. If the
        same name occurs on several lines, the last line wins. Lines without
        a recognisable blame header are ignored.

    Raises:
        subprocess.CalledProcessError: If the git command exits with an error.
    """
    if blame_output is None:
        # Get git blame information
        cmd = ['git', 'blame', '--date=iso', 'csrankings.csv']
        blame_output = subprocess.check_output(cmd, cwd='CSrankings', text=True)

    # Regex pattern to match the date format in git blame output;
    # group 2 is the ISO date. Compiled once, outside the loop.
    date_pattern = re.compile(
        r'\((.*?)\s+(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2}\s+[+-]\d{4})'
    )

    git_history_cache = {}
    for line in blame_output.split('\n'):
        match = date_pattern.search(line)
        if match is None:
            continue
        # The blame header ends at the first ')' after the date. Splitting on
        # the LAST ')' would mangle CSV lines that contain ')' themselves,
        # such as URLs with parentheses.
        _, closing, content = line[match.end():].partition(')')
        if not closing:
            continue
        name = content.strip().split(',', 1)[0].strip()
        git_history_cache[name] = match.group(2)
    return git_history_cache


def read_homepage_data(path='CSrankings/homepage-validated.csv'):
    """Read a two-column CSV into a dict of first column -> second column.

    Args:
        path: CSV file to read. Defaults to the file inside the ``CSrankings``
            checkout, relative to the current working directory.

    Returns:
        A dict built from every row that has at least two fields. No header
        row is skipped, and later rows override earlier ones with the same key.
    """
    # newline='' is what the csv module expects, so that line endings and
    # newlines embedded in quoted fields are parsed by the reader itself.
    with open(path, 'r', encoding='utf-8', newline='') as f:
        return {row[0]: row[1] for row in csv.reader(f) if len(row) >= 2}

def read_author_info(path='CSrankings/generated-author-info.csv'):
    """Aggregate per-author publication counts and conference sets from a CSV.

    The first row is treated as a header and skipped. For each remaining row
    with at least four fields, the fourth field is parsed as a float and added
    to the author's total, and the third field is added to the author's set.
    The author is the stripped first field. Rows whose fourth field is not a
    number are ignored entirely.

    Args:
        path: CSV file to read. Defaults to the file inside the ``CSrankings``
            checkout, relative to the current working directory.

    Returns:
        ``(pub_counts, author_conferences)``: a ``defaultdict(int)`` of summed
        counts and a ``defaultdict(set)`` of conference names, keyed by author.
    """
    pub_counts = defaultdict(int)
    author_conferences = defaultdict(set)
    # newline='' is what the csv module expects for correct newline handling.
    with open(path, 'r', encoding='utf-8', newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # Skip header; an empty file simply yields no rows
        for row in reader:
            if len(row) < 4:
                continue
            # Only the float conversion can fail, so keep the try body minimal.
            try:
                count = float(row[3])
            except ValueError:
                continue
            author_name = row[0].strip()
            pub_counts[author_name] += count
            author_conferences[author_name].add(row[2].strip())
    return pub_counts, author_conferences

def read_country_info(path='CSrankings/country-info.csv'):
    """Read a CSV of institutions into a dict of institution -> ``"region-country"``.

    Args:
        path: CSV file to read. Defaults to the file inside the ``CSrankings``
            checkout, relative to the current working directory.

    Returns:
        A dict keyed by the stripped first field of every row that has at
        least three fields. Each value joins the stripped second and third
        fields with a hyphen. No header row is skipped, and later rows
        override earlier ones with the same key.
    """
    # newline='' is what the csv module expects for correct newline handling.
    with open(path, 'r', encoding='utf-8', newline='') as f:
        return {
            row[0].strip(): f"{row[1].strip()}-{row[2].strip()}"
            for row in csv.reader(f)
            if len(row) >= 3
        }

async def process_faculty_member(name, affiliation, homepage, homepage_data, pub_counts, conferences, git_history, country_data):
    """Build the output record for one faculty member from pre-loaded lookup tables.

    ``homepage`` and ``homepage_data`` are accepted but not used in the body.

    Args:
        name: Faculty member's name; key into the per-author lookups.
        affiliation: Institution name; key into ``country_data``.
        homepage: Unused.
        homepage_data: Unused.
        pub_counts: Mapping of name -> publication count.
        conferences: Mapping of name -> set of conference names.
        git_history: Mapping of name -> date string.
        country_data: Mapping of institution -> location string.

    Returns:
        A dict with the keys ``name``, ``affiliation``, ``affiliation_location``,
        ``added_date``, ``publication_count`` and ``conferences``.
    """
    # Each lookup falls back to a neutral default when the key is missing.
    date_added = git_history.get(name, '')
    total_pubs = pub_counts.get(name, 0)
    venues = conferences.get(name, set())
    location = country_data.get(affiliation, 'us-us')

    return dict(
        name=name,
        affiliation=affiliation,
        affiliation_location=location,
        added_date=date_added,
        publication_count=total_pubs,
        # Sorted so the comma-separated list is deterministic.
        conferences=','.join(sorted(venues)),
    )

async def process_faculty_files():
    """Load all lookup tables and build one record per row of ``csrankings.csv``.

    Reads the homepage, author-info and country CSVs plus the git blame
    history, then creates a ``process_faculty_member`` coroutine for every
    row of ``CSrankings/csrankings.csv`` that has at least three fields.

    Returns:
        The list of record dicts produced by ``asyncio.gather``, in the same
        order as the rows in the CSV file.
    """
    homepage_data = read_homepage_data()
    pub_counts, author_conferences = read_author_info()
    country_data = read_country_info()

    print("Loading git history...")
    git_history = load_git_history()

    # Process the combined csrankings.csv file
    file_path = 'CSrankings/csrankings.csv'
    # newline='' is what the csv module expects for correct newline handling.
    with open(file_path, 'r', encoding='utf-8', newline='') as f:
        reader = csv.reader(f)
        # Skip header. The default avoids StopIteration on an empty file,
        # which would surface as a RuntimeError inside a coroutine.
        next(reader, None)
        rows = list(reader)

    # Create one coroutine per usable row (name, affiliation, homepage).
    tasks = [
        process_faculty_member(row[0].strip(), row[1].strip(), row[2].strip(),
                               homepage_data, pub_counts, author_conferences,
                               git_history, country_data)
        for row in rows
        if len(row) >= 3
    ]

    print(f"Processing {len(tasks)} faculty members...")

    # gather() preserves the order of its arguments in the returned list.
    faculty_data = await asyncio.gather(*tasks)

    return faculty_data

def write_output(faculty_data):
    """Write the faculty records to ``faculty_info.csv`` in the current directory.

    Args:
        faculty_data: Iterable of dicts whose keys are the output columns
            ``name``, ``affiliation``, ``affiliation_location``,
            ``added_date``, ``publication_count`` and ``conferences``.
    """
    columns = [
        'name',
        'affiliation',
        'affiliation_location',
        'added_date',
        'publication_count',
        'conferences',
    ]
    with open('faculty_info.csv', 'w', newline='', encoding='utf-8') as out:
        table = csv.DictWriter(out, fieldnames=columns)
        table.writeheader()
        table.writerows(faculty_data)

async def main():
    """Build every faculty record, write them to CSV, and print how many were processed."""
    records = await process_faculty_files()
    write_output(records)
    print(f"Processed {len(records)} faculty entries")

# Run the async pipeline only when this file is executed as a script.
if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: find_recent_additions/requirements.txt
================================================
tqdm
pytest