Repository: edsu/twarc Branch: main Commit: 12104e080f48 Files: 89 Total size: 595.5 KB Directory structure: gitextract_5vocmduc/ ├── .gitignore ├── .readthedocs.yaml ├── LICENSE ├── MANIFEST.in ├── README.md ├── RELEASING.md ├── docs/ │ ├── README.md │ ├── api/ │ │ ├── client.md │ │ ├── client2.md │ │ ├── expansions.md │ │ └── library.md │ ├── plugins.md │ ├── resources.md │ ├── tutorial.md │ ├── twarc1_en_us.md │ ├── twarc1_es_mx.md │ ├── twarc1_ja_jp.md │ ├── twarc1_pt_br.md │ ├── twarc1_sv_se.md │ ├── twarc1_sw_ke.md │ ├── twarc1_zw_zh.md │ ├── twarc2_en_us.md │ ├── twitter-developer-access.md │ └── windows10.md ├── mkdocs.yml ├── pyproject.toml ├── requirements-mkdocs.txt ├── setup.cfg ├── src/ │ └── twarc/ │ ├── __init__.py │ ├── __main__.py │ ├── client.py │ ├── client2.py │ ├── command.py │ ├── command2.py │ ├── config.py │ ├── decorators.py │ ├── decorators2.py │ ├── expansions.py │ ├── handshake.py │ ├── json2csv.py │ └── version.py ├── test_twarc.py ├── test_twarc2.py └── utils/ ├── auth_timing.py ├── deduplicate.py ├── deleted.py ├── deleted_users.py ├── deletes.py ├── embeds.py ├── emojis.py ├── extractor.py ├── filter_date.py ├── filter_users.py ├── flakey.py ├── foaf.py ├── gender.py ├── geo.py ├── geofilter.py ├── geojson.py ├── json2csv.py ├── media2warc.py ├── media_urls.py ├── network.py ├── noretweets.py ├── oembeds.py ├── remove_limit.py ├── retweets.py ├── search.py ├── sensitive.py ├── sort_by_id.py ├── source.py ├── tags.py ├── times.py ├── twarc-archive.py ├── tweet.py ├── tweet_compliance.py ├── tweet_text.py ├── tweet_urls.py ├── tweetometer.py ├── tweets.py ├── unshrtn.py ├── urls.py ├── users.py ├── validate.py ├── wall.py ├── wayback.py ├── webarchives.py ├── wordcloud.py └── youtubedl.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ *.pyc *.log .cache .venv 
.eggs Pipfile* build dist twarc.egg-info .pytest_cache .vscode .env site uv.lock ================================================ FILE: .readthedocs.yaml ================================================ version: 2 mkdocs: configuration: mkdocs.yml python: version: 3.8 install: - requirements: requirements-mkdocs.txt ================================================ FILE: LICENSE ================================================ The MIT License (MIT) Copyright (c) Documenting the Now Project Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
================================================ FILE: MANIFEST.in ================================================ include requirements.txt include docs/README.md ================================================ FILE: README.md ================================================ # twarc **Note: twarc is no longer actively supported after changes to Twitter's API quotas made it unusable.** --- [![DOI](https://zenodo.org/badge/7605723.svg)](https://zenodo.org/badge/latestdoi/7605723) twarc is a command line tool and Python library for collecting and archiving Twitter JSON data via the Twitter API. It has separate commands (twarc and twarc2) for working with the older v1.1 API and the newer v2 API and Academic Access (respectively). * Read the [documentation](https://twarc-project.readthedocs.io) * Ask questions here in [GitHub](https://github.com/DocNow/twarc/discussions), in [Slack](https://bit.ly/docnow-slack) or [Matrix](https://matrix.to/#/#docnow:matrix.org?via=matrix.org&via=petrichor.me&via=converser.eu) twarc has been developed with generous support from the [Mellon Foundation](https://mellon.org/). ## Contributing New features are welcome and encouraged for twarc. However, to keep the core twarc library and command line tool sustainable we will look at new functionality with the following principles in mind: 1. Purpose: twarc is for *collection* and *archiving* of Twitter data via the Twitter API. 2. Sustainability: keeping the surface area of twarc and its dependencies small enough to ensure high quality. 3. Utility: what is exposed by twarc should be applicable to different people, projects and domains, and not specific use cases. 4. API consistency: as much as sensible we aim to make twarc consistent with the Twitter API, and also aim to make twarc consistent with itself - so commands in core twarc should work similarly to each other, and twarc functionality should align towards the Twitter API. 
For features and approaches that fall outside of this, twarc enables external packages to hook into the twarc2 command line tool via [click-plugins](https://github.com/click-contrib/click-plugins). This means that if you want to propose new functionality, you can create your own package without coordinating with core twarc. ### Documentation The documentation is managed at ReadTheDocs. If you would like to improve the documentation you can edit the Markdown files in `docs` or add new ones. Then send a pull request and we can add it. To view your documentation locally you should be able to: pip install -r requirements-mkdocs.txt pip install -e . mkdocs serve open http://127.0.0.1:8000/ If you prefer you can create a page on the [wiki](https://github.com/docnow/twarc/wiki/) to workshop the documentation, and then when/if you think it's ready to be merged with the documentation create an [issue](https://github.com/docnow/twarc/issues). Please feel free to create whatever documentation is useful in the wiki area. ### Code If you are interested in adding functionality to twarc or fixing something that's broken here are the steps to setting up your development environment: git clone https://github.com/docnow/twarc cd twarc Create a .env file that includes Twitter App keys to use during testing: BEARER_TOKEN=CHANGEME CONSUMER_KEY=CHANGEME CONSUMER_SECRET=CHANGEME ACCESS_TOKEN=CHANGEME ACCESS_TOKEN_SECRET=CHANGEME Now run the tests: uv run pytest Add your code and some new tests, and send a pull request! ================================================ FILE: RELEASING.md ================================================ # Releasing New versions of twarc can be released by creating a release and assigning a new tag in the GitHub repo. The release, including upload of the new version to PyPI, is performed by GitHub actions when a new tag is created, using the PyPI token stored in the secrets associated with the repository. 
Anybody who has the permission to create a tag can perform a release. Steps in a release: 1. Update the version number in `twarc/version.py` - the format is MAJOR.MINOR.PATCH and should always be increasing and unique. 2. Make a new release from https://github.com/DocNow/twarc/releases (hit the 'draft new release' button on the top right). 3. Create a new tag, matching the version number in `twarc/version.py`, with a v prefix (ie. vMAJOR.MINOR.PATCH) 4. Write release notes. 5. Publish the release. 6. Make sure the GitHub action completes successfully. 7. Double check that the new version correctly installs from PyPI: `pip install --upgrade twarc` should install the new version created above. ================================================ FILE: docs/README.md ================================================ # twarc twarc is a command line tool and Python library for collecting and archiving Twitter JSON data via the Twitter API. It has separate commands (twarc and twarc2) for working with the older v1.1 API and the newer v2 API and Academic Access (respectively). It also has an ecosystem of [plugins](plugins) for doing things with the collected data. See the `twarc` documentation for running commands: [twarc2](twarc2_en_us.md) and [twarc1](twarc1_en_us.md) for using the v1.1 API. If you aren't sure about which one to use you'll want to start with twarc2 since the v1.1 is scheduled to be retired. ## Install If you have python installed, you can install twarc from a terminal (such as the Windows Command Prompt available in the "start" menu, or the [OSX Terminal application](https://support.apple.com/en-au/guide/terminal/apd5265185d-f365-44cb-8b09-71a064a42125/mac)): ``` pip3 install twarc ``` Once installed, you should be able to use the twarc and twarc2 command line utilities, or use it as a Python library - check the examples [here](api/library.md) for that. 
## Other Tools Twarc is purpose-built for working with the Twitter API for archiving and studying digital trace data. It is not built as a general purpose API library for Twitter. While the primary use is academic, it works just as well with "Standard" v2 API and "Premium" v1.1 APIs. For a list of general purpose Twitter Libraries in different languages see the [Twitter Documentation](https://developer.twitter.com/en/docs/twitter-api/tools-and-libraries). For Python, [TwitterAPI](https://github.com/geduldig/TwitterAPI) and [tweepy](https://github.com/tweepy/tweepy) are both up to date and maintained. They also support v2 APIs, and their data format with expansions may differ from twarc. There is also a reference implementation of the [v2 Academic Access Search](https://developer.twitter.com/en/docs/twitter-api/tweets/search/api-reference/get-tweets-search-all) and [v1.1 Premium Search](https://developer.twitter.com/en/docs/twitter-api/premium/search-api/overview) from Twitter [here](https://github.com/twitterdev/search-tweets-python/). The [v2 version](https://github.com/twitterdev/search-tweets-python/tree/v2) of this script is compatible with twarc. For `R` there is [academictwitteR](https://cran.r-project.org/web/packages/academictwitteR/vignettes/academictwitteR-intro.html). Unlike twarc, it focuses solely on querying the Twitter Academic Research Product Track v2 API endpoint. Data gathered in twarc can be imported into `R` for analysis as a dataframe if you export the data into CSV using [twarc-csv](https://pypi.org/project/twarc-csv/). ## Getting Help Check out the [tutorial](tutorial.md) to get started, or follow along with this [recorded stream](https://tube.nocturlab.fr/videos/watch/1d98d20e-a4fd-4594-aa94-9b1b1301cead) introducing twarc. You can also find additional resources linked from [resources](resources.md). 
If you run into trouble, feel free to make a post on the [Twarc Repository](https://github.com/DocNow/twarc/issues) or on the [Twitter Developer Forums](https://twittercommunity.com/c/academic-research/62). ================================================ FILE: docs/api/client.md ================================================ # twarc.Client ::: twarc.client handler: python ================================================ FILE: docs/api/client2.md ================================================ # twarc.Client2 ::: twarc.client2 handler: python ================================================ FILE: docs/api/expansions.md ================================================ # twarc.expansions [Expansions](https://developer.twitter.com/en/docs/twitter-api/expansions) are how the new v2 Twitter API includes optional metadata about Tweets. In contrast to v1.1, where each Tweet JSON object is self-contained, in v2 metadata about a whole "page" of requests is included in the response. This means that to get a self-contained Tweet JSON, additional processing is needed to look up each piece of extra metadata. Different tools and libraries may implement this in different ways. In twarc, the goal was to retain the original JSON format and only append extra fields, so that any code that expects original JSON will still work. ::: twarc.expansions handler: python ================================================ FILE: docs/api/library.md ================================================ # Examples of using twarc2 as a library Please see [client2](client2.md) docs for the full list of available functions. Here are some minimal working snippets of code that use twarc2 as a library. ## Search The client implements the API as closely as possible - so if the API docs expect a parameter in a certain way, so does the twarc2 library. 
```python import datetime from twarc.client2 import Twarc2 from twarc.expansions import ensure_flattened # Your bearer token here t = Twarc2(bearer_token="A...z") # Start and end times must be in UTC start_time = datetime.datetime(2021, 3, 21, 0, 0, 0, 0, datetime.timezone.utc) end_time = datetime.datetime(2021, 3, 22, 0, 0, 0, 0, datetime.timezone.utc) # search_results is a generator, max_results is max tweets per page, 100 max for full archive search with all expansions. search_results = t.search_all(query="dogs lang:en -is:retweet", start_time=start_time, end_time=end_time, max_results=100) # Get all results page by page: for page in search_results: # Do something with the whole page of results: # print(page) # or alternatively, "flatten" results returning 1 tweet at a time, with expansions inline: for tweet in ensure_flattened(page): # Do something with the tweet print(tweet) # Stop iteration prematurely, to only get 1 page of results. break ``` ## Working with Generators Twarc will try to retrieve all available results and handle retries and rate limits for you. This can potentially retrieve more tweets than your monthly limit will allow. The command line interface has a `--limit` option, but the library returns generator functions and it is up to you to stop iterating when you have retrieved enough results. 
For example, to only get 2 "pages" of followers max per user: ```python from twarc.client2 import Twarc2 # Your bearer token here t = Twarc2(bearer_token="A...z") user_ids = [12, 2244994945, 4503599627370241] # @jack, @twitterdev, @overflow64 # Iterate over our target users for user_id in user_ids: # Iterate over pages of followers for i, follower_page in enumerate(t.followers(user_id)): # Do something with the follower_page here print(f"Fetched a page of {len(follower_page['data'])} followers for {user_id}") if i == 1: # Only retrieve the first two pages (enumerate starts from 0) break ``` ## twarc CSV `twarc-csv` is an extra plugin you can install: ``` pip install twarc-csv ``` This can also be used as a library, for example: If you have a bunch of data, and want a DataFrame: ``` from twarc_csv import DataFrameConverter # Default options for Dataframe converter converter = DataFrameConverter() # this can be a list or generator of individual tweets or pages or results. json_objects = [...] df = converter.process(json_objects) ``` This doesn't save any files, and converts everything in memory. 
If you have a large file, you should use `CSVConverter` as before ``` from twarc_csv import CSVConverter with open("input.json", "r") as infile: with open("output.csv", "w") as outfile: converter = CSVConverter(infile=infile, outfile=outfile) converter.process() ``` or with additional options: ``` from twarc_csv import CSVConverter, DataFrameConverter converter = DataFrameConverter( input_data_type="tweets", json_encode_all=False, json_encode_text=False, json_encode_lists=True, inline_referenced_tweets=True, merge_retweets=True, allow_duplicates=False, ) with open("results.jsonl", "r") as infile: with open("results.csv", "w") as outfile: converter = CSVConverter(infile=infile, outfile=outfile, converter=converter) converter.process() ``` `DataFrameConverter` parameters correspond to the command line options: https://github.com/DocNow/twarc-csv#extra-command-line-options The full list of valid `output_columns` are: https://github.com/DocNow/twarc-csv/blob/main/dataframe_converter.py#L13-L85 when using `input_data_type="tweets"` and https://github.com/DocNow/twarc-csv/blob/main/dataframe_converter.py#L90-L115 when using `input_data_type="users"`. Note that it won't extract users from tweets, these have to be already extracted from the JSON. `twarc-csv` can also process compliance output and counts output. ## Search and write results to CSV example Here is a complete working example that searches for all recent tweets in the last few hours, writes a `results.jsonl` with the original responses, and then converts this to CSV: ```python import json from datetime import datetime, timezone, timedelta from twarc.client2 import Twarc2 from twarc_csv import CSVConverter # Your bearer token here t = Twarc2(bearer_token="A...z") # Start and end times must be in UTC start_time = datetime.now(timezone.utc) + timedelta(hours=-3) # end_time cannot be immediately now, has to be at least 30 seconds ago. 
end_time = datetime.now(timezone.utc) + timedelta(minutes=-1) query = "dogs lang:en -is:retweet has:media" print(f"Searching for \"{query}\" tweets from {start_time} to {end_time}...") # search_results is a generator, max_results is max tweets per page, not total, 100 is max when using all expansions. search_results = t.search_recent(query=query, start_time=start_time, end_time=end_time, max_results=100) # Get all results page by page: for page in search_results: # Do something with the page of results (append, so earlier pages are not overwritten): with open("dogs_results.jsonl", "a") as f: f.write(json.dumps(page) + "\n") print("Wrote a page of results...") print("Converting to CSV...") # This assumes `dogs_results.jsonl` is finished writing. with open("dogs_results.jsonl", "r") as infile: with open("dogs_output.csv", "w") as outfile: converter = CSVConverter(infile, outfile) converter.process() print("Finished.") ``` ================================================ FILE: docs/plugins.md ================================================ # Plugins twarc v1 collected a set of utilities for working with tweet json in the [utils] directory of the git repository. This was a handy way to develop and share snippets of code. But some utilities had different dependencies which weren't managed in a uniform way. Some of the utilities had slightly different interfaces. They needed to be downloaded from GitHub manually and weren't easily accessible at the command line if you remembered where you put them. With *twarc2* these utilities are now installable as plugins, which are made available as subcommands using the same twarc2 command line. Plugins are published separately from twarc on [PyPI] and are installed with [pip]. 
Here is a list of some known plugins (if you write one please [let us know] so we can add it to this list): * [twarc-ids](https://pypi.org/project/twarc-ids/): a simple example of printing the ids for tweets to use as a reference for creating plugins * [twarc-csv](https://pypi.org/project/twarc-csv/): export tweets to CSV, which is probably the first thing a researcher will want to do * [twarc-videos](https://pypi.org/project/twarc-videos): extract videos from tweets * [twarc-network](https://pypi.org/project/twarc-network): visualize tweets and users as a network graph * [twarc-timeline-archive](https://pypi.org/project/twarc-timeline-archive): routinely download tweet timelines for a list of users * [twarc-hashtags](https://pypi.org/project/twarc-hashtags): create a report of hashtags that are used in collected tweet data * Write your own, and [let us know] so we can add it here! ## Writing a Plugin The [twarc-ids] plugin provides an example of how to write plugins. This reference plugin simply reads collected tweet JSON data and writes out the tweet identifiers. First you install the plugin: pip install twarc-ids and then you use it: twarc2 ids tweets.json > ids.txt Internally twarc's command line is implemented using the [click] library. The [click-plugins] module is what manages twarc2 plugins. Basically you import `click` and implement your plugin as you would any other click utility, for example: ```python import json import click @click.command() @click.argument('infile', type=click.File('r'), default='-') @click.argument('outfile', type=click.File('w'), default='-') def ids(infile, outfile): """ Extract tweet ids from tweet JSON. """ for line in infile: tweet = json.loads(line) click.echo(tweet['data']['id'], file=outfile) ``` Note that the plugin takes input file *infile* and writes to an output file *outfile* which default to stdin and stdout respectively. This allows plugin utilities to be used as part of pipelines. 
You can add options using the standard facilities that click provides if your plugin needs them. If your plugin needs to talk to the Twitter API then just add the `@click.pass_obj` decorator which will ensure that the first parameter in your function will be a Twarc2 client that is configured to use the client's keys. ```python @click.command() @click.argument('infile', type=click.File('r'), default='-') @click.argument('outfile', type=click.File('w'), default='-') @click.pass_obj def ids(twarc_client, infile, outfile): # do something with the twarc client here ``` Finally you just need to create a `setup.py` file for your project that looks something like this: ```python import setuptools setuptools.setup( name='twarc-ids', version='0.0.1', url='https://github.com/docnow/twarc-ids', author='Ed Summers', author_email='ehs@pobox.com', py_modules=['twarc_ids'], description='A twarc plugin to read Twitter data and output the tweet ids', install_requires=['twarc'], setup_requires=['pytest-runner'], tests_require=['pytest'], entry_points=''' [twarc.plugins] ids=twarc_ids:ids ''' ) ``` The key part here is the `entry_points` section which is what allows twarc2 to discover twarc.plugins dynamically at runtime, and also defines how the subcommand maps to the plugin's function. It's good practice to include a test or two for your plugin to ensure it works over time. Check out the example [here] for how to test command line utilities easily with click. 
To publish your plugin on PyPi: ``` pip install twine python setup.py sdist twine upload dist/* # enter pypi login details ``` [twarc-ids]: https://github.com/docnow/twarc-ids/ [PyPI]: https://python.org/pypi/ [pip]: https://pip.pypa.io/en/stable/ [click]: https://click.palletsprojects.com/ [click-plugins]: https://github.com/click-contrib/click-plugins [here]: https://github.com/DocNow/twarc-ids/blob/main/test_twarc_ids.py [let us know]: https://github.com/docnow/twarc/issues/ [utils]: https://github.com/DocNow/twarc/tree/main/utils ================================================ FILE: docs/resources.md ================================================ # Twarc Tutorials and Other Resources Documentation here is largely auto generated from the code, which may not always be the most user friendly. Others have written great tutorials and other resources relating to using twarc, or working with the data generated by twarc. If you'd like to suggest additional resources that are relevant, please feel free to open a pull request or open an issue. ## An Introductory Video from the Australian Digital Observatory A [six minute video](https://www.youtube.com/watch?v=4DXEeM2AA9Y) by the [Australian Digital Observatory](https://www.digitalobservatory.net.au/) that shows some of the functionality of `twarc2` search, as well as how to use [Twitter's Query Builder](https://developer.twitter.com/apitools/query?query=) in conjunction with twarc. ## Carpentries Lesson Includes a step by step guide to collecting Twitter data using `twarc2`. It includes information on Twitter's JSON format, and how to manage collected data. ## UVA Library's Scholars' Lab Twarc Tutorial A beginner guide that also goes through command line and Python setup. Uses `twarc` for v1.1 API examples, not `twarc2`. ## Guide from TwitterDev Twitter have released a 101 guide on using the Academic Access endpoints. It uses `twarc2` as a library as opposed to command line, and gives code examples in R too. 
## Twitter Data Collection & Analysis Lesson from Introduction to Cultural Analytics & Python ## Getting Data from Twitter: A twarc tutorial Uses `twarc` for `v1.1` endpoints and has step by step examples for using some of the `/utils` scripts. ## UCSB Library Twarc Tutorials Uses both `twarc` and `twarc2` ## Introduction to full archive searching using twarc v2 An example of using `twarc2` search, but be sure to install twarc using `pip install twarc` not the link to the v2 branch zip. ================================================ FILE: docs/tutorial.md ================================================ # Twarc Tutorial Twarc is a command line tool for collecting Twitter data via Twitter's web Application Programming Interface (API). This tutorial is aimed at researchers who are new to collecting social media data, and who might be unfamiliar with command line interfaces. By the end of this tutorial, you will have: 1. Familiarised yourself with interacting with a command line application via a terminal 2. Setup Twarc so you can collect data from the Twitter API (version 2) 3. Constructed two Twitter search queries to address a specific research question 4. Collected data for those two queries 5. Processed the collected data into formats suitable for other analysis 6. Performed a simple quantitative comparison of the two collections using Python 7. Prepared a dataset of tweet identifiers that can be shared with other researchers ## Motivating example This tutorial is built around collecting data from Twitter to address the following research question: ***Which monotreme is currently the coolest - the echidna or the platypus?*** We'll answer this question with a simple quantitative approach to analysing the collected data: counting the volume of likes that tweets mentioning each species of animal accrue. For this tutorial, the species that gets the most likes on tweets is going to be considered the "coolest". 
This is a very simplistic quantitative approach, just to get you started on collecting and analysing Twitter data. To seriously study the relative coolness of monotremes, there are a wide variety of more appropriate (but also more involved) methods. ## Introduction to twarc and the Twitter API ### What is an API? An **Application Programming Interface** (API) is a common method for software applications and services to allow other systems or people to programmatically interact with them. For example, Twitter has an API which allows external systems to make requests to Twitter for information or actions. Twitter (and many other web apps and services) uses an HTTP REST API, meaning that to interact with Twitter through the API you can send an HTTP request to a specific URL provided by Twitter. Twitter affords many different URLs (also known as **endpoints**) which have been designed for different purposes (more about that later). Assuming that your HTTP request is valid, Twitter will respond with a bundle of information in [JSON format](https://en.wikipedia.org/wiki/JSON) for you. Twarc acts as a tool or an intermediary for you to interact with the Twitter API, so that you don't have to manage the details of how exactly to make requests to the Twitter API and handle Twitter's responses. Twarc commands correspond roughly with Twitter API endpoints. For example, when you use Twarc to fetch the timeline of a specific Twitter account (we'll use @Twitter in this example), this is the sequence of events: 1. You run `twarc2 timeline Twitter tweets.jsonl` 2. twarc2 makes a request on your behalf to the [Twitter v2 user lookup API endpoint](https://developer.twitter.com/en/docs/twitter-api/users/lookup/introduction) in order to find the user ID for the @Twitter account, and receives a response from the Twitter API server with that user ID 3. 
twarc2 makes a request on your behalf to the [Twitter v2 timeline API endpoint](https://developer.twitter.com/en/docs/twitter-api/tweets/timelines/introduction), using the user ID determined in step 2, and receives a response (or several responses) from the Twitter API server with @Twitter's tweets 4. twarc2 consolidates the timeline responses from step 3 and outputs them according to your initial command, in this case as `tweets.jsonl` There are a great many resources on the internet to learn more about APIs more generally and how to use them in a variety of contexts. Here are a few introductory articles: - [How to Geek: What is an API, and how do developers use them?](https://www.howtogeek.com/343877/what-is-an-api/) - [IBM: What is an API?](https://www.ibm.com/cloud/learn/api) More detailed information on APIs and working with them: - [Zapier: An introduction to APIs](https://zapier.com/learn/apis/) - [RealPython: Python and REST APIs: Interacting with web services](https://realpython.com/api-integration-in-python/) ### What can you do with the Twitter API? The Twitter API is very popular in academic communities for good reason: it is one of the most accessible and research-friendly of the popular social media platforms at present. The Twitter API is well-established and offers a broad range of possibilities for data collection. 
Here are some examples of things you can do with the Twitter API: - Find historical tweets containing words or phrases during a time window of interest - Collect live tweets as they are posted matching specific search criteria - Collect tweets using specific hashtags or mentioning particular users - Collect tweets made by a particular user account - Collect engagement metrics including likes and retweets for specific tweets of interest - Map Twitter account followers and followees within or around a group of users - Trace conversations and interactions around users or tweets of interest You may notice as you read about the Twitter API that there are two versions of the Twitter API - version 1.1 and version 2. At the time of writing, Twitter is providing both versions of the API, but at some unknown point in the future version 1.1 may be discontinued. Twarc can handle either API version: the `twarc` command uses version 1.1 of the Twitter API, the `twarc2` command uses version 2. Take care when reading documentation and tutorials as to which Twitter API version is being referenced. **This tutorial uses version 2 of the Twitter API**. Twitter API endpoints can be structured either around tweets or around user accounts. For example, the search endpoint provides lists of tweets - user information is included, but the data is focused on the tweets. The available endpoints and their details are evolving as Twitter develops and releases its API version 2, so for the most up to date information refer to [the Twitter API documentation](https://developer.twitter.com/en/docs/twitter-api). Some of the most used endpoints for research purposes are: - [search](https://developer.twitter.com/en/docs/twitter-api/tweets/search/introduction): This is the endpoint used to search tweets, whether recent or historical. 
- [lookup](https://developer.twitter.com/en/docs/twitter-api/tweets/lookup/introduction): The lookup endpoints are useful when you have IDs of tweets of interest and want to fetch further data about those tweets - known in the Twarc community as **hydrating** the tweets. - [follows](https://developer.twitter.com/en/docs/twitter-api/users/follows/introduction): The follows endpoint allows collecting information about who follows who on Twitter. With the Twitter API, you can get data related to all types of objects that make up the Twitter experience, including [tweets](https://developer.twitter.com/en/docs/twitter-api/data-dictionary/object-model/tweet) and [users](https://developer.twitter.com/en/docs/twitter-api/data-dictionary/object-model/user). The Twitter documentation provides full details, and these two pages are very useful to bookmark! The Twitter documentation also provides some useful tools for constructing searches and queries: - [Twitter's v2 API Query Builder](https://developer.twitter.com/apitools/query?query=) - [Building high quality filters for getting Twitter data](https://developer.twitter.com/en/docs/tutorials/building-high-quality-filters) The rest of this tutorial is going to focus on using the Twitter search API endpoint to retrieve tweets containing content relevant to the research question. We've chosen to focus on this because: 1. With the rich functionality available in the search API the data collection for many projects can be condensed down to a few carefully chosen searches. 2. With [academic research access](https://developer.twitter.com/en/products/twitter-api/academic-research) it's possible to search the entire Twitter archive, making search uniquely powerful among the endpoints Twitter supports. 
### Introduction to twarc Twarc is at its core an application for interacting with the Twitter API, reading results from the different functionality the API offers, and safely writing the collected data to your machine for further analysis. Twarc handles the mechanical details of interacting with the Twitter API like including information to authenticate yourself, making HTTP requests to the API, formatting data in the right way, and retrying when things on the internet fail. Your job is to work out: 1. Which endpoint you want to call on from the Twitter API. 2. Which data you want to retrieve from that endpoint. Twarc is a command line based application - to use twarc you type a command specifying a particular action, and the results of that command are shown as text on screen. If you haven't used a command line interface before, don't worry! Although there is a bit of a learning curve at the beginning, you will quickly get the hang of it - and because everything is a typed command, it is very easy to record and share _exactly_ how you collected data with other people. ## Considerations when using social media data for research Before we dive into the details, it's worth mentioning some broader issues you will need to keep in mind when working with social media data. This is by no means an exhaustive list of issues and is intended as a starting point for further enquiry. ### Ethical use of "public" communication Even though most tweets on Twitter are public, in that they're accessible to anyone on the web, most users of Twitter don't have any expectation that researchers will be reading their tweets for the purpose of research. Researchers need to be mindful of this when working with data from Twitter, and user expectations should be considered as part of the study design. The Association of Internet Researchers has established [Ethical Guidelines for Internet Research](https://aoir.org/ethics/) which are a good starting point for the higher level considerations. 
Work has also been done specifically looking at [Twitter users' expectations](https://journals.sagepub.com/doi/10.1177/2056305118763366), with a number of key concerns outlined. For this tutorial we're going to be taking a high level quantitative evaluation of very recent Twitter data, which distances ourselves from the specific tweets and users creating them and aligns with these broader ethical considerations. Finally, because tweets (and the internet more generally) are searchable, we need to keep in mind that quoting a tweet in whole or part might allow easy reidentification of any specific user or tweet. For this reason care needs to be taken when reporting material from tweets, and common practices in qualitative research may not align with Twitter users' interests or expectations. ### Copyright This may vary according to where you are in the world but tweets, including the text of the tweet and attached photos and videos are likely to be protected by copyright. As well as the Twitter Developer Agreement considerations in the next section, this may limit what you can do with tweets and media downloaded from Twitter. ### Twitter's terms of service When you signed up for a Twitter developer account you agreed to follow Twitter's [Developer Agreement and Policy](https://developer.twitter.com/en/developer-terms/agreement-and-policy). This agreement constrains how you can use and share Twitter data. While the primary purpose of this agreement is to protect Twitter the company, this policy also incorporates some elements aimed at protecting users of Twitter. Some particular things to note from the Developer Agreement are: - Limits on how geolocation data can be used - How to share Twitter data - Dealing with deleted tweets Note that researchers using deleted tweets were also key concerns for [Twitter users](https://journals.sagepub.com/doi/10.1177/2056305118763366). 
This tutorial won't cover geolocation data at all, but will cover approaches to sharing Twitter data and removing deleted material from collections. ## Setup Twarc is a command line application, written in the Python programming language. To get Twarc running on our machines, we're going to need to install Python, then install Twarc itself, and we will also need to set up a Twitter developer account. ### Twitter developer access [Start here](https://developer.twitter.com/en/apply-for-access) to apply for a Twitter developer account and follow the steps in [our developer access guide](twitter-developer-access.md). For this tutorial, you can skip step 2, as we won't require academic access. Once you have the **Bearer Token**, you are ready for the next step. This token is like a password, so you shouldn't share it with other people. You will also need to be able to enter this token once to configure Twarc, so it would be best to copy and paste it to a text file on your local machine until we've finished configuration. ### Install Python #### Windows Install the latest version [for Windows](https://www.python.org/downloads/windows/). During the installation, make sure the *Add Python to PATH* option is selected/ticked. ![](images/win_installer.png) #### Mac Install the latest version [for Mac](https://www.python.org/downloads/macos/). No additional setup should be necessary for Python. ### Install Twarc and other utilities For this tutorial we're going to install three Python packages, `twarc`, an extension called `twarc-csv`, and `pandas`, a Python library for data analysis. We will use a command line interface to install these packages. On Windows we will use the `cmd` console, which can be found by searching for `cmd` from the start menu - you should see a prompt like the below screenshot. On Mac you can open the `Terminal` app.
![Screenshot showing the opening of the cmd window on windows](images/CMD.png) Once you have a terminal open we can run the following command to install the necessary packages: ```shell pip install twarc twarc-csv pandas ``` You should see output similar to the following: ![](images/pip_install.png) ### Our first command: making sure everything is working Let's open a terminal and get started - just like when installing twarc, you will want to use the `cmd` application on windows and the `Terminal` application on Mac. The first command we want to run is to check if everything in twarc is installed and working correctly. We'll use twarc's builtin `help` for this. Running the following command should show you a brief overview of the functionality that the twarc2 command provides and some of the options available: ```shell twarc2 --help ``` ![](images/twarc_help.png) Twarc is structured like many other command line applications: there is a single main command, `twarc2`, to launch the application, and then you provide a subcommand, or additional arguments, or flags to provide additional context about what that command should actually do. In this case we're only launching the `twarc2` command, and providing a single _flag_ `--help` (the double-dash syntax is usually used for this). Most terminal applications will have a `--help` or `-h` flag that will provide some useful information about the application you're running. This often includes example usage, options, and a short description. Note also that often when reading commands out loud, the space in between words is not mentioned explicitly: the command above (`twarc2 --help`) might be read as "twarc-two dash dash help". Though we won't cover the command line outside of using Twarc in this tutorial, your operating system's command line functionality is extensive and can help you automate a lot of otherwise tedious tasks. 
If you're interested in learning more, the [Software Carpentry lesson on the shell](https://swcarpentry.github.io/shell-novice/) is a good starting point. ### Configuring twarc with our bearer token The next thing we want to do is tell twarc about our bearer token so we can authenticate ourselves with the Twitter API. This can be done using twarc's `configure` command. In this case we're going to use the `twarc2` main command, and provide it with the subcommand `configure` to tell twarc we want to start the configuration process. ``` twarc2 configure ``` On running this command twarc will prompt us to paste our bearer token, as shown in the screenshot below. Note that for many command line terminals on Windows, using the usual `Ctrl+V` keyboard shortcut will not work by default. If this happens, try right-clicking, then click `paste` to achieve the same thing. After entering our token, we will be prompted to enter additional information - this is not necessary for this tutorial, so we will skip this step by typing the letter `n` and hitting `enter`. ![](images/twarc_configure.png) ## Introduction to Twitter search and counts To tackle the research question we're interested in we're going to use the search endpoint to retrieve two sets of tweets: those using the word echidna, and those using the word platypus. There are two key commands that the Twitter API provides for search: a `search` endpoint to retrieve tweets matching a particular query, and a `counts` endpoint to tell you how many tweets match that query over time.
It's always a good idea to start with the `counts` endpoint first, because: - it lets you establish early on how many tweets you will need to deal with: too many or too few matching tweets will help you determine whether your search strategy is reasonable - it can take a long time to retrieve large numbers of tweets and it's better to know in advance how much data you will need to deal with - the count and trend over time is useful in and of itself - if you accidentally search for the wrong thing you can consume your monthly quota of tweets without collecting anything useful Let's get started with the `counts` API - in twarc this is accessible by the command `counts`. As before `twarc2` is our entry command, `counts` is the subcommand we're interested in, and the `echidna` is what we're interested in searching for on Twitter (the query). ```shell twarc2 counts echidna ``` You should see something like the below screenshot - and yes, this output isn't very readable! By default twarc shows us the response in the JSON format directly from the Twitter API, so it's not great for using directly on the command line. ![](images/twarc_count_echidna.png) Let's improve this by updating our command to: ```shell twarc2 counts echidna --text --granularity day ``` And we should see output like below (your results will be different, because you're searching on a different day to when these screenshots were captured). Note that `--text` and `--granularity` are optional flags provided to the `twarc2 counts` command; we can see other options by running `twarc2 counts --help`. In this case `--text` returns a simplified text output for easier reading, and `--granularity day` is passed to the Twitter API to specify that we're interested only in daily counts of tweets, not the default hourly count.
```shell 2022-11-03T02:49:02.000Z - 2022-11-04T00:00:00.000Z: 974 2022-11-04T00:00:00.000Z - 2022-11-05T00:00:00.000Z: 802 2022-11-05T00:00:00.000Z - 2022-11-06T00:00:00.000Z: 527 2022-11-06T00:00:00.000Z - 2022-11-07T00:00:00.000Z: 554 2022-11-07T00:00:00.000Z - 2022-11-08T00:00:00.000Z: 883 2022-11-08T00:00:00.000Z - 2022-11-09T00:00:00.000Z: 723 2022-11-09T00:00:00.000Z - 2022-11-10T00:00:00.000Z: 1,567 2022-11-10T00:00:00.000Z - 2022-11-10T02:49:02.000Z: 219 ``` Note that this is only the count for the last seven days, which is the level of search functionality available for all developers via the standard track of the Twitter API. If you have access to the [Twitter Academic track](https://developer.twitter.com/en/use-cases/do-research/academic-research), you can switch to searching the full Twitter archive from the `counts` and `search` commands by adding the `--archive` flag. Twitter search is powerful and provides many rich options. However, it also functions a little differently to most other search engines, because Twitter search does not focus on _ranking_ tweets by relevance (like a web search engine does). Instead, Twitter search via the API focuses on retrieving all matching tweets in chronological order. In other words, Twitter search uses the [Boolean model of searching](https://nlp.stanford.edu/IR-book/html/htmledition/boolean-retrieval-1.html), and returns the documents that match exactly what you provide and nothing else. Let's work through this example a little further, first we want to expand to capture more variants of the word echidna - note that Twitter search via the API matches on the whole word, so `echidna` and `echidnas` are different. 
You can also see that we've added some double quotes around our query - without these quotes the individual pieces of our query might be interpreted as additional arguments to our search command: ```shell twarc2 counts "echidna echidna's echidnas" --granularity day --text ``` ```console 2022-11-03T03:40:44.000Z - 2022-11-04T00:00:00.000Z: 0 2022-11-04T00:00:00.000Z - 2022-11-05T00:00:00.000Z: 0 2022-11-05T00:00:00.000Z - 2022-11-06T00:00:00.000Z: 0 2022-11-06T00:00:00.000Z - 2022-11-07T00:00:00.000Z: 0 2022-11-07T00:00:00.000Z - 2022-11-08T00:00:00.000Z: 0 2022-11-08T00:00:00.000Z - 2022-11-09T00:00:00.000Z: 0 2022-11-09T00:00:00.000Z - 2022-11-10T00:00:00.000Z: 0 2022-11-10T00:00:00.000Z - 2022-11-10T03:40:44.000Z: 0 ``` Suddenly we're retrieving very few results! By default, if you don't specify an operator, the Twitter API assumes you mean AND, or that all of the words should be present - we will need to explicitly say that we want any of these words using the OR operator: ```shell twarc2 counts "echidna OR echidna's OR echidnas" --granularity day --text ``` ```console 2022-11-03T03:42:10.000Z - 2022-11-04T00:00:00.000Z: 964 2022-11-04T00:00:00.000Z - 2022-11-05T00:00:00.000Z: 846 2022-11-05T00:00:00.000Z - 2022-11-06T00:00:00.000Z: 552 2022-11-06T00:00:00.000Z - 2022-11-07T00:00:00.000Z: 573 2022-11-07T00:00:00.000Z - 2022-11-08T00:00:00.000Z: 962 2022-11-08T00:00:00.000Z - 2022-11-09T00:00:00.000Z: 758 2022-11-09T00:00:00.000Z - 2022-11-10T00:00:00.000Z: 1,591 2022-11-10T00:00:00.000Z - 2022-11-10T03:42:10.000Z: 288 ``` We can also apply operators based on other content or properties of tweets (see more [search operators](https://developer.twitter.com/en/docs/twitter-api/tweets/search/integrate/build-a-query#list) in the Twitter API documentation). Because we're deciding to focus on the number of likes on tweets as our measure of coolness, we want to exclude retweets. 
If we don't exclude retweets, our like measure might be heavily influenced by one highly retweeted tweet. We can do this using the `-` (minus) operator, which allows us to exclude tweets matching a criteria, in conjunction with the `is:retweet` operator, which filters on whether the tweet is a retweet or not. If we applied just the `is:retweet` operator we'd only see the retweets, the opposite of what we want. ```shell twarc2 counts "echidna OR echidna's OR echidnas -is:retweet" --granularity day --text ``` ```text 2022-11-03T03:43:02.000Z - 2022-11-04T00:00:00.000Z: 957 2022-11-04T00:00:00.000Z - 2022-11-05T00:00:00.000Z: 826 2022-11-05T00:00:00.000Z - 2022-11-06T00:00:00.000Z: 546 2022-11-06T00:00:00.000Z - 2022-11-07T00:00:00.000Z: 570 2022-11-07T00:00:00.000Z - 2022-11-08T00:00:00.000Z: 931 2022-11-08T00:00:00.000Z - 2022-11-09T00:00:00.000Z: 750 2022-11-09T00:00:00.000Z - 2022-11-10T00:00:00.000Z: 1,587 2022-11-10T00:00:00.000Z - 2022-11-10T03:43:02.000Z: 288 ``` There's one tiny gotcha from the Twitter API here, which is important to know about. AND operators are applied before OR operators, even if the AND is not specified by the user. The query we wrote above actually means something like below. We're only removing the retweets containing the word "echidnas", not all retweets: ``` echidna OR echidna's OR (echidnas AND -is:retweet) ``` We can make our intent explicit by adding parentheses to group terms. This is a good idea in general to make your meaning clear, even if you know all of the operator rules. ```shell twarc2 counts "(echidna OR echidna's OR echidnas) -is:retweet" --granularity day --text ``` Now for the purposes of this tutorial we're going to stop exploring any further, but we could continue to refine and improve this query to match our research question. Twitter lets you build very long queries (up to 512 characters on the standard track and 1024 for the academic track) so you have plenty of scope to express yourself. 
As mentioned earlier, [Twitter's Query Builder](https://developer.twitter.com/apitools/query?query=) is an excellent tool for helping you to build your query. If we apply the same kind of process to the platypus case, we might end up with something like the following. In this case it was necessary to use the [Twitter search web interface](https://twitter.com/explore) to find some of the variations in the word platypus: ```shell twarc2 counts "(platypus OR platypus's OR platypi OR platypusses OR platypuses) -is:retweet" --granularity day --text ``` Having decided on the actual queries to run and examined the counts, now it's time to actually collect the tweets! We can take the queries we ran earlier, replace the `counts` command with `search` and remove the `counts` specific arguments to get: ```shell twarc2 search "(echidna OR echidna's OR echidnas) -is:retweet" echidna.json twarc2 search "(platypus OR platypus's OR platypi OR platypusses OR platypuses) -is:retweet" platypus.json ``` Running these two commands will save the tweets matching each of those searches to two files on our disk, which we will use for the next sessions. ![Screenshot showing the progress of the tweets being downloaded](images/twarc_progress_download.png) TIP: if you're not sure where the files above have been saved, you can run the command `cd` on Windows, or `pwd` on Mac to have your shell print out the folder in the filesystem where twarc has been working. ## Understanding and transforming twitter JSON data Now that we've collected some data, it's time to take a look at it. Let's start by viewing the collected data in its plainest form: as a text file. Although we named the file with an extension of `.json`, this is just a convention: the actual file content is plain text in the [JSON](https://en.wikipedia.org/wiki/JSON) format. Let's open this file with our inbuilt text editor (Notepad on Windows, TextEdit on Mac).
![Screenshot of the json file in notepad](images/json_echidna.png) You'll notice immediately that there is a *lot* of data in that file: tweets are rich objects, and we mentioned that twarc by default captures as much information as Twitter makes available. Further, the Twitter API provides data in a format that makes it convenient for machines to work with, but not so much for humans. ## Making a CSV file from our collected tweets We don't recommend trying to manually parse this raw data unless you have specific needs that aren't covered by existing tools. So we're going to use the `twarc-csv` package that we installed earlier to do the heavy lifting of transforming the collected JSON into a more friendly comma-separated value ([CSV](https://en.wikipedia.org/wiki/Comma-separated_values)) file. CSV is a simple plaintext format, but unlike JSON format is easy to import or open with a spreadsheet. The `twarc-csv` package lets us use a `csv` command to transform the files from twarc: ```shell twarc2 csv echidna.json echidna.csv twarc2 csv platypus.json platypus.csv ``` If we look at these files in our text editor again, we'll see a nice structure of one line per tweet, with all of the many columns for that tweet. ![Screenshot of the plaintext CSV file in notepad](images/echidna_csv.png) Since we're going to do more analysis with the Pandas library to answer our question, we will want to create the CSV with only the columns of interest. This will reduce the time and amount of computer memory/RAM you need to load your dataset. 
For example, the following commands produce CSV files with a small number of fields: ```shell twarc2 csv --output-columns id,created_at,author_id,text,referenced_tweets.retweeted.id,public_metrics.like_count echidna.json echidna_minimal.csv twarc2 csv --output-columns id,created_at,author_id,text,referenced_tweets.retweeted.id,public_metrics.like_count platypus.json platypus_minimal.csv ``` ### The problem with Excel It's tempting to try to open these CSV files directly in Excel, but if you do you're probably going to notice one or more of the following problems, as illustrated below: 1. The ID columns are likely to be broken. 2. Emoji and languages that don't use latin characters may not appear correctly. 3. Tweets may be broken up on newlines. 4. Excel can only support 1,048,576 rows - it's very easy to collect tweet datasets bigger than this. ![Screenshot of the broken CSV file opened directly in excel](images/excel_echidna.png) If you save a file from Excel with any of those problems that file is no longer useful for most purposes (this is a common and longstanding problem with using spreadsheet software, that affects many fields. For example in genomics: https://www.nature.com/articles/d41586-021-02211-4). While it is possible to make Excel do the right thing with your data, it takes more work, and a single mistake can lead to loss of important data. Therefore our recommendation is, if possible, to avoid the use of spreadsheets for analysing Twitter data. ### Working with Pandas If you are going to be using the scientific Python library [Pandas](https://pandas.pydata.org/) for any processing or analysis, you may wish to use Pandas methods. Pandas can be used to load and manipulate data like we have in our CSV file. Note that for this section we're going to run a very simple computation, the references will have links to more extensive resources for learning more. 
```python # process_monotremes.py import pandas echidna = pandas.read_csv("echidna_minimal.csv") platypus = pandas.read_csv("platypus_minimal.csv") echidna_likes = echidna["public_metrics.like_count"].sum() platypus_likes = platypus["public_metrics.like_count"].sum() print(f"Total likes on echidna tweets: {echidna_likes}. Total likes on platypus tweets: {platypus_likes}.") ``` Run this script through Python to see which of the monotremes is the coolest: ```shell python process_monotremes.py ``` ### Answering the research question: which monotreme is the coolest? At the time of creating this tutorial, the above script run with the just collected data leads to the following result: ```shell Total likes on echidna tweets: 1787652. Total likes on platypus tweets: 3462715. ``` On that basis, we can conclude that at the time of running this search the platypus is nearly twice as cool as the echidna based on Twitter likes. Of course this is a simplistic approach to answering this specific research question - we could have made many other choices. Even using a simple quantitative approach looking at metrics: we could have chosen to look at other engagement counts like the number of retweets, or looked at the number of followers of the accounts tweeting about each animal (because a "cooler" account will have more followers). Much of the challenge in using Twitter for research is both about asking the right research question and also choosing the right approach to the data to address that research question. ## Prepare a dataset for sharing/using a shared dataset Having performed this analysis and come to a conclusion, it is good practice to share the underlying data so other people can reproduce these results (with some caveats). Noting that we want to preserve Twitter users' agency over the availability of their content, and Twitter's Developer Agreement, we can do this by creating a dataset of tweet IDs.
Instead of sharing the content of the tweets, we can share the unique ID for that tweet, which allows others to `hydrate` the tweets by retrieving them again from the Twitter API. This can be done as follows using twarc's `dehydrate` command: ```shell twarc2 dehydrate --id-type tweets platypus.json platypus_ids.txt twarc2 dehydrate --id-type tweets echidna.json echidna_ids.txt ``` These commands will produce the two text files, with each line in these files containing the unique ID of the tweet. To `hydrate`, or retrieve the tweets again, we can use the corresponding commands: ```shell twarc2 hydrate platypus_ids.txt platypus_hydrated.json twarc2 hydrate echidna_ids.txt echidna_hydrated.json ``` Note that the hydrated files will include fewer tweets: tweets that have been deleted, or tweets by accounts that have been deleted, suspended, or protected, will not be included in the file. Note also that hydrating a dataset also means that engagement metrics like retweets and likes will be up to date for tweets that are still available. ## Suggested resources You can find some additional links and resources in the [resources section](https://twarc-project.readthedocs.io/en/latest/resources/) of the twarc documentation. ================================================ FILE: docs/twarc1_en_us.md ================================================ twarc1 ===== ***For information about working with the Twitter V2 API please see the [twarc2](https://twarc-project.readthedocs.io/en/latest/twarc2/) page.*** --- twarc is a command line tool and Python library for archiving Twitter JSON data. Each tweet is represented as a JSON object that is [exactly](https://dev.twitter.com/overview/api/tweets) what was returned from the Twitter API. Tweets are stored as [line-oriented JSON](https://en.wikipedia.org/wiki/JSON_Streaming#Line-delimited_JSON). twarc will handle Twitter API's [rate limits](https://dev.twitter.com/rest/public/rate-limiting) for you. 
In addition to letting you collect tweets twarc can also help you collect users, trends and hydrate tweet ids. twarc was developed as part of the [Documenting the Now](http://www.docnow.io) project which was funded by the [Mellon Foundation](https://mellon.org/). ## Install Before using twarc you will need to register an application at [apps.twitter.com](http://apps.twitter.com). Once you've created your application, note down the consumer key, consumer secret and then click to generate an access token and access token secret. With these four variables in hand you are ready to start using twarc. 1. install [Python 3](http://python.org/download) 2. [pip](https://pip.pypa.io/en/stable/installing/) install twarc: ``` pip install --upgrade twarc ``` ### Homebrew (macOS only) For macOS users, you can also install `twarc` via [Homebrew](https://brew.sh/): ```bash $ brew install twarc ``` ### Windows If you installed with pip and see a "failed to create process" when running twarc try reinstalling like this: python -m pip install --upgrade --force-reinstall twarc ## Quickstart: First you're going to need to tell twarc about your application API keys and grant access to one or more Twitter accounts: twarc configure Then try out a search: twarc search blacklivesmatter > search.jsonl Or maybe you'd like to collect tweets as they happen? twarc filter blacklivesmatter > stream.jsonl See below for the details about these commands and more. ## Usage ### Configure Once you've got your application keys you can tell twarc what they are with the `configure` command. twarc configure This will store your credentials in a file called `.twarc` in your home directory so you don't have to keep entering them in. If you would rather supply them directly you can set them in the environment (`CONSUMER_KEY`, `CONSUMER_SECRET`, `ACCESS_TOKEN`, `ACCESS_TOKEN_SECRET`) or using command line options (`--consumer_key`, `--consumer_secret`, `--access_token`, `--access_token_secret`). 
### Search This uses Twitter's [search/tweets](https://dev.twitter.com/rest/reference/get/search/tweets) to download *pre-existing* tweets matching a given query. twarc search blacklivesmatter > tweets.jsonl It's important to note that `search` will return tweets that are found within a 7 day window that Twitter's search API imposes. If this seems like a small window, it is, but you may be interested in collecting tweets as they happen using the `filter` and `sample` commands below. The best way to get familiar with Twitter's search syntax is to experiment with [Twitter's Advanced Search](https://twitter.com/search-advanced) and copy and pasting the resulting query from the search box. For example here is a more complicated query that searches for tweets containing either the \#blacklivesmatter or #blm hashtags that were sent to deray. twarc search '#blacklivesmatter OR #blm to:deray' > tweets.jsonl You also should definitely check out Igor Brigadir's *excellent* reference guide to the Twitter Search syntax: [Advanced Search on Twitter](https://github.com/igorbrigadir/twitter-advanced-search/blob/master/README.md). There are lots of hidden gems in there that the advanced search form doesn't make readily apparent. 
Twitter attempts to code the language of a tweet, and you can limit your search to a particular language if you want using an [ISO 639-1] code: twarc search '#blacklivesmatter' --lang fr > tweets.jsonl You can also search for tweets with a given location, for example tweets mentioning *blacklivesmatter* that are 1 mile from the center of Ferguson, Missouri: twarc search blacklivesmatter --geocode 38.7442,-90.3054,1mi > tweets.jsonl If a search query isn't supplied when using `--geocode` you will get all tweets relevant for that location and radius: twarc search --geocode 38.7442,-90.3054,1mi > tweets.jsonl ### Filter The `filter` command will use Twitter's [statuses/filter](https://developer.twitter.com/en/docs/twitter-api/v1/tweets/filter-realtime/api-reference/post-statuses-filter) API to collect tweets as they happen. twarc filter blacklivesmatter,blm > tweets.jsonl Please note that the syntax for Twitter's track queries is significantly different from the syntax of queries in their search API. Consult the [track documentation](https://developer.twitter.com/en/docs/twitter-api/v1/tweets/filter-realtime/guides/basic-stream-parameters#track) on how best to express the filter option you are using. Use the `follow` command line argument if you would like to collect tweets from a given user id as they happen. This includes retweets. For example this will collect tweets and retweets from CNN: twarc filter --follow 759251 > tweets.jsonl You can also collect tweets using a bounding box. Note: the leading dash needs to be escaped in the bounding box or else it will be interpreted as a command line argument! twarc filter --locations "\-74,40,-73,41" > tweets.jsonl You can use the `lang` command line argument to pass in a [ISO 639-1] language code to limit to, and since the filter stream allows you to filter by one or more languages it is repeatable.
So this would collect tweets that mention paris or madrid that were made in French or Spanish: twarc filter paris,madrid --lang fr --lang es If you combine filter and follow options they are OR'ed together. For example this will collect tweets that use the blacklivesmatter or blm hashtags and also tweets from user CNN: twarc filter blacklivesmatter,blm --follow 759251 > tweets.jsonl But combining locations and languages will result effectively in an AND. For example this will collect tweets from the greater New York area that are in Spanish or French: twarc filter --locations "\-74,40,-73,41" --lang es --lang fr ### Sample Use the `sample` command to listen to Twitter's [statuses/sample](https://dev.twitter.com/streaming/reference/get/statuses/sample) API for a "random" sample of recent public statuses. twarc sample > tweets.jsonl ### Dehydrate The `dehydrate` command generates an id list from a file of tweets: twarc dehydrate tweets.jsonl > tweet-ids.txt ### Hydrate twarc's `hydrate` command will read a file of tweet identifiers and write out the tweet JSON for them using Twitter's [status/lookup](https://dev.twitter.com/rest/reference/get/statuses/lookup) API. twarc hydrate ids.txt > tweets.jsonl Twitter API's [Terms of Service](https://dev.twitter.com/overview/terms/policy#6._Be_a_Good_Partner_to_Twitter) discourage people from making large amounts of raw Twitter data available on the Web. The data can be used for research and archived for local use, but not shared with the world. Twitter does allow files of tweet identifiers to be shared, which can be useful when you would like to make a dataset of tweets available. You can then use Twitter's API to *hydrate* the data, or to retrieve the full JSON for each identifier. This is particularly important for [verification](https://en.wikipedia.org/wiki/Reproducibility) of social media research. ### Users The `users` command will return User metadata for the given screen names. 
twarc users deray,Nettaaaaaaaa > users.jsonl You can also give it user ids: twarc users 1232134,1413213 > users.jsonl If you want you can also use a file of user ids, which can be useful if you are using the `followers` and `friends` commands below: twarc users ids.txt > users.jsonl ### Followers The `followers` command will use Twitter's [follower id API](https://dev.twitter.com/rest/reference/get/followers/ids) to collect the follower user ids for exactly one user screen name per request as specified as an argument: twarc followers deray > follower_ids.txt The result will include exactly one user id per line. The response order is reverse chronological, or most recent followers first. ### Friends Like the `followers` command, the `friends` command will use Twitter's [friend id API](https://dev.twitter.com/rest/reference/get/friends/ids) to collect the friend user ids for exactly one user screen name per request as specified as an argument: twarc friends deray > friend_ids.txt ### Trends The `trends` command lets you retrieve information from Twitter's API about trending hashtags. You need to supply a [Where On Earth](https://web.archive.org/web/20180102203025/https://developer.yahoo.com/geo/geoplanet/) identifier (`woeid`) to indicate what trends you are interested in. For example here's how you can get the current trends for St Louis: twarc trends 2486982 Using a `woeid` of 1 will return trends for the entire planet: twarc trends 1 If you aren't sure what to use as a `woeid` just omit it and you will get a list of all the places for which Twitter tracks trends: twarc trends If you have a geo-location you can use it instead of the `woeid`: twarc trends 39.9062,-79.4679 Behind the scenes twarc will look up the location using Twitter's [trends/closest](https://dev.twitter.com/rest/reference/get/trends/closest) API to find the nearest `woeid`.
### Timeline The `timeline` command will use Twitter's [user timeline API](https://dev.twitter.com/rest/reference/get/statuses/user_timeline) to collect the most recent tweets posted by the user indicated by screen_name. twarc timeline deray > tweets.jsonl You can also look up users using a user id: twarc timeline 12345 > tweets.jsonl ### Retweets You can get retweets for a given tweet id like so: twarc retweets 824077910927691778 > retweets.jsonl If you have tweet_ids that you would like to fetch the retweets for, you can: twarc retweets ids.txt > retweets.jsonl ### Replies Unfortunately Twitter's API does not currently support getting replies to a tweet. So twarc approximates it by using the search API. Since the search API does not support getting tweets older than a week, twarc can only get the replies to a tweet that have been sent in the last week. If you want to get the replies to a given tweet you can: twarc replies 824077910927691778 > replies.jsonl Using the `--recursive` option will also fetch replies to the replies as well as quotes. This can take a long time to complete for a large thread because of rate limiting by the search API. twarc replies 824077910927691778 --recursive ### Lists To get the users that are on a list you can use the list URL with the `listmembers` command: twarc listmembers https://twitter.com/edsu/lists/bots ## Premium Search API Twitter introduced a Premium Search API that lets you pay Twitter money for tweets. Once you have set up an environment in your [dashboard](https://developer.twitter.com/en/dashboard) you can use their 30day and fullarchive endpoints to search for tweets outside the 7 day window provided by the Standard Search API. To use the premium API from the command line you will need to indicate which endpoint you are using, and the environment. To avoid using up your entire budget you will likely want to limit the time range using `--to_date` and `--from_date`. 
Additionally you can limit the maximum number of tweets returned using `--limit`. So for example, if I wanted to get all the blacklivesmatter tweets from two weeks ago (assuming today is June 1, 2020) using my environment named *docnowdev* but not retrieving more than 1000 tweets, I could: twarc search blacklivesmatter \ --30day docnowdev \ --from_date 2020-05-01 \ --to_date 2020-05-14 \ --limit 1000 \ > tweets.jsonl Similarly, to find tweets from 2014 using the full archive you can: twarc search blacklivesmatter \ --fullarchive docnowdev \ --from_date 2014-08-04 \ --to_date 2014-08-05 \ --limit 1000 \ > tweets.jsonl If your environment is sandboxed you will need to use `--sandbox` so that twarc knows not to request more than 100 tweets at a time (the default for non-sandboxed environments is 500): twarc search blacklivesmatter \ --fullarchive docnowdev \ --from_date 2014-08-04 \ --to_date 2014-08-05 \ --limit 1000 \ --sandbox \ > tweets.jsonl ## Gnip Enterprise API twarc supports integration with the Gnip Twitter Full-Archive Enterprise API. To do so, you must pass in the `--gnip_auth` argument. Additionally, set the `GNIP_USERNAME`, `GNIP_PASSWORD`, and `GNIP_ACCOUNT` environment variables. You can then run the following: twarc search blacklivesmatter \ --gnip_auth \ --gnip_fullarchive prod \ --from_date 2014-08-04 \ --to_date 2015-08-05 \ --limit 1000 \ > tweets.jsonl ## Use as a Library If you want you can use twarc programmatically as a library to collect tweets. You first need to create a `Twarc` instance (using your Twitter credentials), and then use it to iterate through search results, filter results or lookup results.
```python from twarc import Twarc t = Twarc(consumer_key, consumer_secret, access_token, access_token_secret) for tweet in t.search("ferguson"): print(tweet["text"]) ``` You can do the same for a filter stream of new tweets that match a track keyword: ```python for tweet in t.filter(track="ferguson"): print(tweet["text"]) ``` or location: ```python for tweet in t.filter(locations="-74,40,-73,41"): print(tweet["text"]) ``` or user ids: ```python for tweet in t.filter(follow='12345,678910'): print(tweet["text"]) ``` Similarly you can hydrate tweet identifiers by passing in a list of ids or a generator: ```python for tweet in t.hydrate(open('ids.txt')): print(tweet["text"]) ``` ## User vs App Auth twarc will manage rate limiting by Twitter. However, you should know that their rate limiting varies based on the way that you authenticate. The two options are User Auth and App Auth. twarc defaults to using User Auth but you can tell it to use App Auth. Switching to App Auth can be handy in some situations like when you are searching tweets, since User Auth can only issue 180 requests every 15 minutes (1.6 million tweets per day), but App Auth can issue 450 (4.3 million tweets per day). But be careful: the `statuses/lookup` endpoint used by the hydrate subcommand has a rate limit of 900 requests per 15 minutes for User Auth, and 300 requests per 15 minutes for App Auth. If you know what you are doing and want to force App Auth, you can use the `--app_auth` command line option: twarc --app_auth search ferguson > tweets.jsonl Similarly, if you are using twarc as a library you can: ```python from twarc import Twarc t = Twarc(app_auth=True) for tweet in t.search('ferguson'): print(tweet['id_str']) ``` ## Utilities In the utils directory there are some simple command line utilities for working with the line-oriented JSON, like printing out the archived tweets as text or html, extracting the usernames, referenced URLs, etc.
If you create a script that you find handy please send a pull request. When you've got some tweets you can create a rudimentary wall of them: utils/wall.py tweets.jsonl > tweets.html You can create a word cloud of tweets you collected about nasa: utils/wordcloud.py tweets.jsonl > wordcloud.html If you've collected some tweets using `replies` you can create a static D3 visualization of them with: utils/network.py tweets.jsonl tweets.html Optionally you can consolidate tweets by user, allowing you to see central accounts: utils/network.py --users tweets.jsonl tweets.html Additionally, you can create a network of hashtags, allowing you to view their colocation: utils/network.py --hashtags tweets.jsonl tweets.html And if you want to use the network graph in a program like [Gephi](https://gephi.org/), you can generate a GEXF file with the following: utils/network.py --users tweets.jsonl tweets.gexf utils/network.py --hashtags tweets.jsonl tweets.gexf Additionally if you want to convert the network into a dynamic network with timeline enabled (i.e. nodes will appear and disappear according to their attributes), you can open up your GEXF file in Gephi and follow [these instructions](https://seinecle.github.io/gephi-tutorials/generated-html/converting-a-network-with-dates-into-dynamic.html). Note that in tweets.gexf there is a column for "start_date" (which is the day the post was created) but none for "end_date" and that in the dynamic timeline, the nodes will appear on the screen at their start date and stay on screen forever after. For the "Time Interval creation options" pop-up in Gephi, the "Start time column" should be "start_date", the "End time column" should be empty, the "Parse dates" should be selected, and the Date format should be the last option, "dd/MM/yyyy HH:mm:ss". gender.py is a filter which allows you to filter tweets based on a guess about the gender of the author. 
So for example you can filter out all the tweets that look like they were from women, and create a word cloud for them: utils/gender.py --gender female tweets.jsonl | utils/wordcloud.py > tweets-female.html You can output [GeoJSON](http://geojson.org/) from tweets where geo coordinates are available: utils/geojson.py tweets.jsonl > tweets.geojson Optionally you can export GeoJSON with centroids replacing bounding boxes: utils/geojson.py tweets.jsonl --centroid > tweets.geojson And if you do export GeoJSON with centroids, you can add some random fuzzing: utils/geojson.py tweets.jsonl --centroid --fuzz 0.01 > tweets.geojson To filter tweets by presence or absence of geo coordinates (or Place, see [API documentation](https://dev.twitter.com/overview/api/places)): utils/geofilter.py tweets.jsonl --yes-coordinates > tweets-with-geocoords.jsonl cat tweets.jsonl | utils/geofilter.py --no-place > tweets-with-no-place.jsonl To filter tweets by a GeoJSON fence (requires [Shapely](https://github.com/Toblerity/Shapely)): utils/geofilter.py tweets.jsonl --fence limits.geojson > fenced-tweets.jsonl cat tweets.jsonl | utils/geofilter.py --fence limits.geojson > fenced-tweets.jsonl If you suspect you have duplicates in your tweets you can dedupe them: utils/deduplicate.py tweets.jsonl > deduped.jsonl You can sort by ID, which is analogous to sorting by time: utils/sort_by_id.py tweets.jsonl > sorted.jsonl You can filter out all tweets before a certain date (for example, if a hashtag was used for another event before the one you're interested in): utils/filter_date.py --mindate 1-may-2014 tweets.jsonl > filtered.jsonl You can get an HTML list of the clients used: utils/source.py tweets.jsonl > sources.html If you want to remove the retweets: utils/noretweets.py tweets.jsonl > tweets_noretweets.jsonl Or unshorten urls (requires [unshrtn](https://github.com/docnow/unshrtn)): cat tweets.jsonl | utils/unshrtn.py > unshortened.jsonl Once you unshorten your URLs you can get a ranked list
of most-tweeted URLs: cat unshortened.jsonl | utils/urls.py | sort | uniq -c | sort -nr > urls.txt ## twarc-report Some further utility scripts to generate csv or json output suitable for use with [D3.js](http://d3js.org/) visualizations are found in the [twarc-report](https://github.com/pbinkley/twarc-report) project. The util `directed.py`, formerly part of twarc, has moved to twarc-report as `d3graph.py`. Each script can also generate an html demo of a D3 visualization, e.g. [timelines](https://wallandbinkley.com/twarc/bill10/) or a [directed graph of retweets](https://wallandbinkley.com/twarc/bill10/directed-retweets.html). [Chinese]: https://github.com/DocNow/twarc/blob/main/README_zw_zh.md [Japanese]: https://github.com/DocNow/twarc/blob/main/README_ja_jp.md [Portuguese]: https://github.com/DocNow/twarc/blob/main/README_pt_br.md [Spanish]: https://github.com/DocNow/twarc/blob/main/README_es_mx.md [Swedish]: https://github.com/DocNow/twarc/blob/main/README_sv_se.md [Swahili]: https://github.com/DocNow/twarc/blob/main/README_sw_ke.md [ISO 639-1]: https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes ================================================ FILE: docs/twarc1_es_mx.md ================================================ # twarc1 twarc es una recurso de línea de commando y catálogo de Python para archivar JSON dato de Twitter. Cada tweet se representa como un artículo de JSON que es [exactamente](https://developer.twitter.com/en/docs/tweets/data-dictionary/overview/tweet-object) lo que fue capturado del API de Twitter. Los Tweets se archivan como [JSON de línea orientado](https://en.wikipedia.org/wiki/JSON_streaming#Line-delimited_JSON). twarc se encarga del [límite de tarifa](https://developer.twitter.com/en/docs/basics/rate-limiting) del API de Twitter. twarc también puede facilitar la colección de usuarios, tendencias y detallar las identificaciones de los tweets. 
twarc fue desarrollado como parte del proyecto [Documenting the Now](http://www.docnow.io/) el cual fue financiado por el [Mellon Foundation](https://mellon.org/). ## La Instalación Antes de usar twarc es necesario registrarse por [apps.twitter.com](https://apps.twitter.com/). Después de establecer la solicitud, se anota la clave del consumidor, el secreto del consumidor, y entonces clickear para generar un access token y el secreto del access token. Con estos cuatro requisitos, está listo para usar twarc. 1. Instala [Python](https://www.python.org/downloads/) (2 ó 3) 2. Instala twarc a través de pip (si estás actualizando: pip install --upgrade twarc) ## Quickstart: Para empezar, se necesita dirigir a twarc sobre las claves de API: `twarc configure` Prueba una búsqueda: `twarc search blacklivesmatter > search.jsonl` ¿O quizás, preferirá coleccionar tweets en tiempo real? `twarc filter blacklivesmatter > stream.jsonl` Vea abajo por detalles sobre estos commandos y más. ## Uso ### Configure Una vez que tenga sus claves de aplicación, puede dirigir a twarc lo que son con el commando `configure`. `twarc configure` Esto archiva sus credenciales en un archivo que se llama `.twarc` en su directorio personal para que no tenga que volver a ingresar los datos. Si prefiere ingresar los datos directamente, se puede establecer en el ambiente `(CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN, ACCESS_TOKEN_SECRET)` o usando las opciones de línea commando `(--consumer_key, --consumer_secret, --access_token, --access_token_secret)`. ### Search Esto se usa para [las búsquedas](https://developer.twitter.com/en/docs/api-reference-index) de Twitter para descargar *preexistentes* tweets que corresponde a una consulta en particular. `twarc search blacklivesmatter > tweets.jsonl` Es importante a notar que este `search` dará resultados los tweets que se encuentran dentro de una ventana de siete días como se imponga la búsqueda del Twitter API.
Si parece una ventana mínima, lo es, pero puede ser que el interés es en coleccionar tweets en tiempo real usando `filter` y `sample` commandos detallados abajo. La mejor manera de familiares con la búsqueda de syntax de Twitter es experimentado con el [Búsqueda Avanzada de Twitter](https://twitter.com/search-advanced) y copiar y pegar la consulta de la caja de búsqueda. Por ejemplo, abajo hay una consulta más complicada que busca los tweets que contienen #blacklivesmatter OR #blm hastags que se enviaron a deray. `twarc search '#blacklivesmatter OR #blm to:deray' > tweets.jsonl` Twitter puede codificar el lenguaje de un tweet, y puede limitar su búsqueda a un lenguaje particular: `twarc search '#blacklivesmatter' --lang fr > tweets.jsonl` También, puede buscar tweets dentro de un lugar geográfico, por ejemplo, los tweets que menciona blacklivesmatter que están a una milla del centro de Ferguson, Missouri: `twarc search blacklivesmatter --geocode 38.7442,-90.3054,1mi > tweets.jsonl` Si una bsqueda no está identificado cuando se usa "--geocode" se regresa a los tweets en esa ubicación y radio: `twarc search --geocode 38.7442,-90.3054,1mi > tweets.jsonl` ### Filter El commando "filter" se usa Twitter's ["status/filter"](https://developer.twitter.com/en/docs/tutorials/consuming-streaming-data) API para coleccionar tweets en tiempo real. `twarc filter blacklivesmatter,blm > tweets.jsonl` Favor de notar que el sintaxis para los track queries de Twitter es differente de las búsquedas en el search API. Favor de consultar la documentación. Use el commando `follow` para coleccionar tweets de una identificación de usuario en particular en tiempo real. Incluye retweets. Por ejemplo, esto colecciona tweets y retweets de CNN: `twarc filter --follow 759251 > tweets.jsonl` También se puede coleccionar tweets usando un "bounding box". Nota: ¡el primer guion necesita estar escapado en el "bounding box" si no, estará interpretado como un argumento de línea de commando! 
`twarc filter --locations "\-74,40,-73,41" > tweets.jsonl` Si combina las opciones serán "OR'ed" juntos. Por ejemplo, esto colecciona los tweets que usan los hashtags de blacklivesmatter o blm y tambien tweets del usario CNN: `twarc filter blacklivesmatter,blm --follow 759251 > tweets.jsonl` ### Sample Usa el commando `sample` para probar a los [statuses/API de muestra](https://developer.twitter.com/en/docs/tutorials/consuming-streaming-data) para una muestra "azar" de tweets recientes. `twarc sample > tweets.jsonl` ### Dehydrate El commando `dehydrate` genera una lista de id's de un archivo de tweets: `twarc dehydrate tweets.jsonl > tweet-ids.txt` ### Hydrate El mando `hydrate` busca a través de un archivo de identificadores y regresa el JSON del tweet usando el ["status/lookup API"](https://developer.twitter.com/en/docs/api-reference-index). `twarc hydrate ids.txt > tweets.jsonl` Los [términos de servicio](https://developer.twitter.com/en/developer-terms/policy#6._Be_a_Good_Partner_to_Twitter) del API de Twitter desalientan los usuarios a hacer público por el internet los datos de Twitter. Los datos se pueden usar para el estudio y archivado para uso local, pero no para compartir público. Aún, Twitter permite archivos de identificadores de Twitter ser compartidos. Puede usar el API de Twitter para hidratar los datos, o recuperar el completo JSON dato. Esto es importante para la [verificación](https://en.wikipedia.org/wiki/Reproducibility) del estudio de los redes sociales. ### Users El commando `user` regresa metadata de usuario para los nobres de pantalla. 
`twarc users deray,Nettaaaaaaaa > users.jsonl` También puede acceder ids de usuario: `twarc users 1232134,1413213 > users.jsonl` Si quiere, también se puede usar un archivo de user ids: `twarc users ids.txt > users.jsonl` ### Followers El commando `followers` usa el [follower id API](https://developer.twitter.com/en/docs/accounts-and-users/follow-search-get-users/api-reference/get-followers-ids) para coleccionar los user ids para un nombre de pantalla por búsqueda: `twarc followers deray > follower_ids.txt` El resultado incluye un user id por cada línea. El orden es en reversa cronológica, o los followers más recientes. ### Friends El commando `friends` usa el [friend id API](https://developer.twitter.com/en/docs/accounts-and-users/follow-search-get-users/api-reference/get-friends-ids) de Twitter para coleccionar los friend user ids para un nombre de pantalla por búsqueda: `twarc friends deray > friend_ids.txt` ### Trends El commando `trends` regresa información del Twitter API sobre los hashtags populares. Necesita ingresar un [Where on Earth idenfier (`woeid`)](https://en.wikipedia.org/wiki/WOEID) para indicar cual temas quieres buscar. Por ejemplo: `twarc trends 2486982` Usando un woeid de 1 regresara temas para el planeta: `twarc trends 1` También se puede omitir el `woeid` y los datos que regresan serán una lista de los lugares por donde Twitter localiza las temas: `twarc trends` Si tiene un geo-location, puede usarlo. `twarc trends 39.9062,-79.4679` twarc buscara el lugar usando el [trends/closest](https://developer.twitter.com/en/docs/api-reference-index) API para encontrar el `woeid` más cerca. ### Timeline El commando `timeline` usa el [user timeline API](https://developer.twitter.com/en/docs/api-reference-index) para coleccionar los tweets más recientes del usuario indicado por el nombre de pantalla. 
`twarc timeline deray > tweets.jsonl` También se puede buscar usuarios usando un user id: `twarc timeline 12345 > tweets.jsonl` ### Retweets Se puede buscar retweets de un tweet específico: `twarc retweets 824077910927691778 > retweets.jsonl` ### Replies Desafortunadamente, el API de Twitter no soporte buscando respuestas a un tweet. Entonces, twarc usa el search API. EL search API no regresa tweets mayores de siete días. Si quieres buscar las respuestas de un tweet: `twarc replies 824077910927691778 > replies.jsonl` El commando `--recursive` regresa respuestos a los respuestos. Esto puede tomar mucho tiempo para un thread muy grande porque el rate liming por el search API. `twarc replies 824077910927691778 --recursive` ### Lists Para conseguir los usuarios en una lista, se puede usar el list URL con el commando `listmembers`. `twarc listmembers https://twitter.com/edsu/lists/bots` ## Use as a Library twarc se puede usar programáticamente como una biblioteca para coleccionar tweets. Necesitas usar un `twarc` instance (usando tus credenciales de Twitter), y luego lo usas para buscar por resultados de búsqueda. `from twarc import Twarc t = Twarc(consumer_key, consumer_secret, access_token, access_token_secret) for tweet in t.search("ferguson"): print(tweet["text"])` Puedes usar lo mismo para el filtro de stream de nuevos de tweets que sean iguales al track keyword. `for tweet in t.filter(track="ferguson"): print(tweet["text"])` o lugar: `for tweet in t.filter(locations="-74,40,-73,41"): print(tweet["text"])` o user ids: `for tweet in t.filter(follow='12345,678910'): print(tweet["text"])` También los identificados de tweets se pueden hydratar: `for tweet in t.hydrate(open('ids.txt')): print(tweet["text"])` ## Utilities En el directorio de utilidades hay algunos commando simple de line utilities para trabajar conel line-oriented JSON, Como imprimiendo out the archived tweets as texto o html, extracting the usernames, referenced URLs, etc. 
Si creas un script que tú puedas encontrar fácilmente por favor envía un pull request. Cuando tengas algunos tweets puedes crear una pared rudimentaria de ellos: `% utils/wall.py tweets.jsonl > tweets.html` Puedes crear un word cloud de tweets que has coleccionado sobre nasa: `% utils/wordcloud.py tweets.jsonl > wordcloud.html` Si has coleccionado algunos tweets usando `replies` puedes crear a static D3 visualization de ellos con: `% utils/network.py tweets.jsonl tweets.html` Tienes la opción de consolidar tweets por user, permitiéndote ver las cuentas centrales: `% utils/network.py --users tweets.jsonl tweets.html` Y si quieres usar la graficas del network en un programa como [Gephi](https://gephi.org/), puedes generar un GEXF file con lo siguiente: `% utils/network.py --users tweets.jsonl tweets.gexf` gender.py es un filtro que te permite filtrar tweets basados en un guess sobre el género del autor. Por ejemplo, puedes filtrar todos los tweets que parecen ser de mujeres, y crear un word cloud para ellos: `% utils/gender.py --gender female tweets.jsonl | utils/wordcloud.py > tweets-female.html` Se puede usar [GeoJSON](http://geojson.org/) de tweets que tienen geo coordiates: `% utils/geojson.py tweets.jsonl > tweets.geojson` Tienes la opcion de exportar GeoJSON con centroids replacing bounding boxes: `% utils/geojson.py tweets.jsonl --centroid > tweets.geojson` Y si exportas GeoJSON with centroids, puedes añadir algunos random fuzzing: `% utils/geojson.py tweets.jsonl --centroid --fuzz 0.01 > tweets.geojson` Para filtrar tweets por presencia o ausencia de coordenadas geo (o por lugar Place, verifica [API documentacion](https://developer.twitter.com/en/docs/basics/getting-started)): `% utils/geofilter.py tweets.jsonl --yes-coordinates > tweets-with-geocoords.jsonl % cat tweets.jsonl | utils/geofilter.py --no-place > tweets-with-no-place.jsonl` Para filtrar con GeoJSON fence (se necesita [Shapely](https://github.com/Toblerity/Shapely)): `% utils/geofilter.py 
tweets.jsonl --fence limits.geojson > fenced-tweets.jsonl % cat tweets.jsonl | utils/geofilter.py --fence limits.geojson > fenced-tweets.jsonl` Si sospechas que tienes un duplicado en tus tweets se puede usar "dedupe": `% utils/deduplicate.py tweets.jsonl > deduped.jsonl` Para ordenar por ID: `% utils/sort_by_id.py tweets.jsonl > sorted.jsonl` Puedes filtrar todos los tweets antes de una fecha exacta (Por ejemplo, si un hashtag fue usado para otro evento antes del que te interesaba): `% utils/filter_date.py --mindate 1-may-2014 tweets.jsonl > filtered.jsonl` Puedes conseguir un listado de HTML de clientes usados: `% utils/source.py tweets.jsonl > sources.html` Si deseas remover los retweets: `% utils/noretweets.py tweets.jsonl > tweets_noretweets.jsonl` O unshorten urls (se necesita [unshrtn](https://github.com/DocNow/unshrtn)): `% cat tweets.jsonl | utils/unshrtn.py > unshortened.jsonl` Una vez hayas unshorten tus URLs puedes obtener un listado de los most-tweeted URLs: `% cat unshortened.jsonl | utils/urls.py | sort | uniq -c | sort -nr > urls.txt` ## twarc-report Más commandos de "utility" para generar csv o json output con uso con [D3.js](https://d3js.org/) visualizaciones son encontrados en el [twarc-report](https://github.com/pbinkley/twarc-report) project. El util `directed.py` ahora es `d3graph.py`. Cada script también puede generar un html demo de D3 visualization, e.g. [timelines](https://www.wallandbinkley.com/twarc/bill10/) o una [gráfica dirigida de retweets](https://www.wallandbinkley.com/twarc/bill10/directed-retweets.html).
--- Crédito de tradução: [Tina Figueroa] [japonés]: https://github.com/DocNow/twarc/blob/main/README_ja_jp.md [Portugués]: https://github.com/DocNow/twarc/blob/main/README_pt_br.md [Inglés]: https://github.com/DocNow/twarc/blob/main/README.md [Sueco]: https://github.com/DocNow/twarc/blob/main/README_sv_se.md [Swahili]: https://github.com/DocNow/twarc/blob/main/README_sw_ke.md [Tina Figueroa]: https://github.com/@tinafigueroa ================================================ FILE: docs/twarc1_ja_jp.md ================================================ twarc1 ===== twarcは、TwitterのJSONデータをアーカイブするためのコマンドラインツールおよびPythonライブラリーのプログラムです。 - 各ツイートは、Twitter APIから返された内容を[正確に](https://dev.twitter.com/overview/api/tweets)表すJSONオブジェクトとして表示されます。 - ツイートは[line-oriented JSON](https://en.wikipedia.org/wiki/JSON_Streaming#Line-delimited_JSON)として保存されます。 - twarcがTwitterのAPI[レート制限](https://dev.twitter.com/rest/public/rate-limiting)を処理してくれます。 - twarcはツイートを収集できるだけでなく、ユーザー、トレンド、ツイートIDの詳細な情報の収集(hydrate; ハイドレート)にも役立ちます。 twarcは[Mellon Foundation](https://mellon.org/)によって援助された[Documenting the Now](http://www.docnow.io)プロジェクトの一環として開発されました. ## Install | インストール twarcを使う前に[Twitter Developers](http://apps.twitter.com)にあなたのアプリケーションを登録する必要があります. 登録したら, コンシューマーキーとその秘密鍵を控えておきます. そして「Create my access token」をクリックして、アクセストークンと秘密鍵を生成して控えておいてください. これら4つの鍵が手元に揃えば, twarcを使い始める準備は完了です. 1. [Python](http://python.org/download)をインストールする (Version2か3) 2. [pip](https://pip.pypa.io/en/stable/installing/) install twarcする ### Homebrew (macOSだけ) `twarc`は以下によってインストールできます. ```bash $ brew install twarc ``` ## Quickstart | クイックスタート まず初めに, アプリケーションのAPIキーをtwarcに教え, 1つ以上のTwitterアカウントへのアクセスを許可する必要があります. twarc configure 検索を試してみましょう. twarc search blacklivesmatter > search.jsonl または, 呟かれたツイートを収集したいですか? twarc filter blacklivesmatter > stream.jsonl コマンドなどの詳細については, 以下を参照してください. ## Usage | 用法 ### Configure | 設定 `configure`コマンドで, 取得したアプリケーションキーをtwarcに教えることができます. 
break twarc configure これにより, ホームディレクトリの`.twarc`というファイルに資格情報が保存されるため, 常に入力し続ける必要はありません. 直接指定したい場合は, 環境変数(`CONSUMER_KEY`, `CONSUMER_SECRET`, `ACCESS_TOKEN`, `ACCESS_TOKEN_SECRET`)か, コマンドラインオプション(`--consumer_key`, `--consumer_secret`, `--access_token`, `--access_token_secret`)を使用してください. ### Search | 検索 検索には, 与えられたクエリに適合する*既存の*ツイートをダウンロードするために, Twitterの[search/tweets](https://dev.twitter.com/rest/reference/get/search/tweets) APIを使います. twarc search blacklivesmatter > tweets.jsonl ここで重要なのは, `search`コマンドがTwitter検索APIの課す7日間以内の期限中から見つかったツイートを返すということです. もし期限が「短すぎる」と思うのなら(まあそれはそうですが), 以下の`filter`コマンドや`sample`コマンドを使って収集してみると面白いかもしれません. Twitterの検索構文についてよく知るためのベストプラクティスは, [Twitter's Advanced Search](https://twitter.com/search-advanced)で試してみて, 検索窓からクエリ文の結果をコピペすることです. 例えば以下の例は, `@deray`に送信された, ハッシュタグ`#blacklivesmatter`か`#blm`かの一方を含むツイートを検索する複雑なクエリです. twarc search '#blacklivesmatter OR #blm to:deray' > tweets.jsonl また, [Igor Brigadir](https://github.com/igorbrigadir)の*素晴らしい*Twitter検索構文のリファレンスを絶対にチェックしておくべきです.([Advanced Search on Twitter](https://github.com/igorbrigadir/twitter-advanced-search/blob/master/README.md)) 高度な検索フォームには, すぐにはみつからない隠れた宝石がたくさんあります. Twitterはツイートの言語をコーディングしようとします. [ISO 639-1]コードを使用すれば, 特定の言語に検索を制限できます. twarc search '#blacklivesmatter' --lang fr > tweets.jsonl 特定の場所でのツイートを検索することもできます. 例えば, ミズーリ州ファーガソンの中心から1マイルの`blacklivesmatter`に言及するツイートなどを検索できます. twarc search blacklivesmatter --geocode 38.7442,-90.3054,1mi > tweets.jsonl `--geocode`の使用時に検索クエリが提供されない場合, その場所と半径に関連する全てのツイートを返します. twarc search --geocode 38.7442,-90.3054,1mi > tweets.jsonl ### Filter | フィルター `filter`コマンドは, 呟かれたツイートを収集するために, Twitterの[statuses/filter](https://dev.twitter.com/streaming/reference/post/statuses/filter) APIを使います. twarc filter blacklivesmatter,blm > tweets.jsonl ここで注意すべきなのは, Twitterのトラッククエリの構文は, 検索APIのクエリとは少し異なるということです. そのため, 使用しているフィルターオプションの最も良い表現方法については, ドキュメントを参照してください. 特定のユーザーIDから呟かれたツイートを収集したい場合は, `follow`引数を使いましょう. これにはリツイートも含まれます. 例えば, これは`@CNN`のツイート及びリツイートを収集します. 
twarc filter --follow 759251 > tweets.jsonl 境界ボックス座標の数値(バウンディングボックス)を用いてツイートを収集することもできます. 注意: 先頭のダッシュ(`-`)はバウンディングボックス内ではエスケープする必要があります. エスケープしないと, コマンドライン引数として解釈されてしまいます! twarc filter --locations "\-74,40,-73,41" > tweets.jsonl `lang`コマンドライン引数を使用して, 検索を制限する[ISO 639-1]の言語コードを渡すことができます. フィルターストリームでは, 1つ以上の言語でフィルタリングできるため, 繰り返し可能です. 以下は, フランス語またはスペイン語で呟かれた, パリまたはマドリードに言及しているツイートを収集します. twarc filter paris,madrid --lang fr --lang es フィルタを組み合わせてオプションの後ろに続けた場合には, それらは共にORで結ばれます. 例えば, これはハッシュタグ`#blacklivesmatter`または`#blm`を使用するツイート, 及びユーザー`@CNN`からのツイートを収集します. twarc filter blacklivesmatter,blm --follow 759251 > tweets.jsonl ただし, 場所と言語を組み合わせると, 結果的にANDになります. 例えば, これは, スペイン語またはフランス語で呟かれた, ニューヨークあたりからのツイートを収集します. twarc filter --locations "\-74,40,-73,41" --lang es --lang fr ### Sample | 抽出 `sample`コマンドは, Twitterの[statuses/sample](https://dev.twitter.com/streaming/reference/get/statuses/sample) APIに直近のパブリックステータスの「無作為な」抽出を尋ねるのに使えます. twarc sample > tweets.jsonl ### Dehydrate | デハイドレート `dehydrate`コマンドはツイートのJSONLファイルからツイートIDのリストを生成します. twarc dehydrate tweets.jsonl > tweet-ids.txt ### Hydrate | ハイドレート twarcの`hydrate`コマンドは, ツイートの識別子のファイルを読み込んで, Twitterの[status/lookup](https://dev.twitter.com/rest/reference/get/statuses/lookup) APIを用いてそれらのツイートのJSONを書き出します. twarc hydrate ids.txt > tweets.jsonl Twitter APIの[利用規約](https://dev.twitter.com/overview/terms/policy#6._Be_a_Good_Partner_to_Twitter)では, 人々が大量のTwitterの生データをWeb上で利用可能にすることを制限しています. - データは調査に使用したり, ローカルで使用するためにアーカイブしたりできますが, 世界と共有することはできません. - Twitterはツイートの識別子ファイルを共有することは許可しており, それはツイートのデータセットを利用可能にしたい場合に役立ちます. - それから, Twitter APIでデータを*ハイドレート*(注:水和)したり, またそれぞれの識別子のフルJSONデータを取得することは許可されています. - `hydrate`は特に, ソーシャルメディア研究を[検証](https://ja.wikipedia.org/wiki/再現性)する時に重要となります. ### Users | ユーザー `users`コマンドは, 与えられたスクリーンネームを持つユーザーのメタデータを返します. twarc users deray,Nettaaaaaaaa > users.jsonl またユーザーidも与えることができます. twarc users 1232134,1413213 > users.jsonl また, 望むなら以下のようにユーザーidのファイルを使用可能で, `followers`や`friends`といったコマンドを使っているときに有効です.
twarc users ids.txt > users.jsonl ### Followers | フォロワー `followers`コマンドは, Twitterの[follower id API](https://dev.twitter.com/rest/reference/get/followers/ids)を用い, 引数として指定されたリクエストごとに1つだけのスクリーン名を持つユーザーのフォロワーのユーザーIDを収集します. twarc followers deray > follower_ids.txt 結果には, 行ごとに1つのユーザーIDが含まれ, その応答順序は逆時系列順, すなわち最新のフォロワーが初めに来ます. ### Friends | 友達 `followers`コマンドと同じく, `friends`コマンドはTwitterの[friend id API](https://dev.twitter.com/rest/reference/get/friends/ids)を用いて, 引数として指定されたリクエストごとに1つだけのスクリーン名を持つユーザーのフレンド(フォロー)ユーザーIDを収集します. twarc friends deray > friend_ids.txt ### Trends | トレンド 時に, 興味のあるトレンドの地域を示す[Where On Earth](https://web.archive.org/web/20180102203025/https://developer.yahoo.com/geo/geoplanet/)識別子(`WOE ID`)をオプションに与える必要があります. 例としてセントルイスの現在のトレンドを取得するやり方を示します. twarc trends 2486982 `WOE ID`に`1`を用いることで, 全世界のトレンドが取得されます. twarc trends 1 `WOE ID`として何を使用すればよいかわからない場合は, 以下のように`WOE ID`を省略することで, Twitterがトレンドを追跡している全ての場所のリストを取得できます. twarc trends Geolocationがあれば, `WOE ID`の代わりにジオロケーションを使用できます. twarc trends 39.9062,-79.4679 バックグラウンドでtwarcは, Twitterの[trends/closest](https://dev.twitter.com/rest/reference/get/trends/closest) APIを使用して, 場所を検索し, 最も近い`WOE ID`を見つけます. ### Timeline | タイムライン `timeline`コマンドは, Twitterの[user timeline API](https://dev.twitter.com/rest/reference/get/statuses/user_timeline)を用いて, スクリーンネームで示されるユーザーが投稿した最新のツイートを収集します. twarc timeline deray > tweets.jsonl また, ユーザーIDからユーザーを調べることもできます. twarc timeline 12345 > tweets.jsonl ### Retweets | リツイート 指定されたツイートIDのリツイートを以下のように取得できます. twarc retweets 824077910927691778 > retweets.jsonl ### Replies | 返信 残念ながら, TwitterのAPIは現在, ツイートへの返信の取得をサポートしていません. 代わりに, twarcは検索APIを使用してその機能の近似を行います. Twitterの検索APIは, 1週間以上前のツイートの取得をサポートしていません. そのため, twarcは先週までに送信されたツイートに対する返信のみを取得できます. 特定のツイートへの返信を取得したい場合は以下のようにします. twarc replies 824077910927691778 > replies.jsonl `--recursive`オプションを使用すると, 返信に対する返信や引用も取得されます. 検索APIによるレート制限のために, 長いスレッドの場合は完了するのに長時間かかる場合があります. 
twarc replies 824077910927691778 --recursive ### Lists | リスト リストにあるユーザを取得するには、`listmembers`コマンドで list URLを使用します。 twarc listmembers https://twitter.com/edsu/lists/bots ## Premium Search API Twitterでは、ツイートに対してTwitterにお金を支払うことができるプレミアム検索APIが導入されました。 [ダッシュボード](https://developer.twitter.com/en/dashboard)で環境設定をした後、 「Standard Search API」が提供する7日間のウィンドウ外で、30日間とフルアーカイブ でのエンドポイントを使ってツイートを検索することができます。コマンドラインから Premium APIを使用するには、使用しているエンドポイントと環境を指定する必要があります。 予算全体を使い果たすことを避けるために、`--to_date`と`--from_date`を使用して 時間範囲を制限することをおすすめします。また、`--limit`を使用して返される ツイートの最大数を制限することができます。 例えば、(今日が2020年6月1日だと仮定し)2週間前の全てのblacklivesmatterツイートを、 *docnowdev*という名前の環境を使って取得したいが、1000件を超えるツイートは取得しない 場合は、次のような操作ができます。 twarc search blacklivesmatter \ --30day docnowdev \ --from_date 2020-05-01 \ --to_date 2020-05-14 \ --limit 1000 \ > tweets.jsonl 同様に、フルアーカイブを使用して2014年のツイートを検索するには、次の方法があります。 twarc search blacklivesmatter \ --fullarchive docnowdev \ --from_date 2014-08-04 \ --to_date 2014-08-05 \ --limit 1000 \ > tweets.jsonl 環境がサンドボックス化されている場合、twarcが一度に100件以上のツイートを要求しないように、 `--sandbox`を使用する必要があります。(サンドボックス化されていない環境のデフォルトは 500) twarc search blacklivesmatter \ --fullarchive docnowdev \ --from_date 2014-08-04 \ --to_date 2014-08-05 \ --limit 1000 \ --sandbox \ > tweets.jsonl ## Use as a Library | ライブラリとして使用 必要に応じてtwarcをプログラム的にライブラリとして使ってツイートを収集することができます。 最初に(Twitterの資格情報を使用して)twarcインスタンスを作成し、検索結果、フィルタ結果、 またはルックアップ結果の反復を処理するために使用できます。 ```python from twarc import Twarc t = Twarc(consumer_key, consumer_secret, access_token, access_token_secret) for tweet in t.search("ferguson"): print(tweet["text"]) ``` trackキーワードに一致する新しいツイートのフィルタストリームに対しても同じことができます。 ```python for tweet in t.filter(track="ferguson"): print(tweet["text"]) ``` また`location`なら, ```python for tweet in t.filter(locations="-74,40,-73,41"): print(tweet["text"]) ``` `user id`なら, ```python for tweet in t.filter(follow='12345,678910'): print(tweet["text"]) ``` 同様に, IDのリストまたはジェネレーターを渡すことで, ツイートIDをハイドレートできます. 
```python for tweet in t.hydrate(open('ids.txt')): print(tweet["text"]) ``` ## User vs App Auth twarcはTwitterによるレート制限を管理しますが、 それらのレート制限は、認証方法によって 異なります。ユーザー認証とアプリ認証の2つのオプションがありますが、twarcは デフォルトでユーザー認証を使用しますが、アプリ認証を使用するように指示することもできます。 アプリ認証への切り替えは、ツイートを検索するときなんかに便利です。ユーザー認証は 15分ごとに180件(1日あたり160万件)しかリクエストできないのに対し、アプリ認証は450件 (1日あたり430万件)のリクエストができるからです。 ただし注意すべきことは、ハイドレートサブコマンドで使用される`statuses / lookup` エンドポイントには、ユーザー認証の場合は15分あたり900件までリクエスト、アプリ 認証の場合は15分あたり300件までのリクエストのレート制限があるということです。 自分が何をしているかを知っていて、アプリ認証を強制したい場合は、次のように`--app_auth` コマンドラインオプションが使用できます。 twarc --app_auth search ferguson > tweets.jsonl 同様に、twarcをライブラリとして使用している場合は、次のことができます。 ```python from twarc import Twarc t = Twarc(app_auth=True) for tweet in t.search('ferguson'): print(tweet['id_str']) ``` ## Utilities | ユーティリティ `utils`ディレクトリには, line-oriented JSONを操作するための簡単なコマンドラインユーティリティがいくつかあります. 例えばアーカイブされたツイートをテキストまたはHTMLとして出力したり, ユーザー名や参照URLなどを抽出したりするものです. 便利なスクリプトを自作したら, 是非プルリクエストをください. いくつかツイートが手元にある時, それらを用いて初歩的なWallを作成できます. utils/wall.py tweets.jsonl > tweets.html `NASA`について収集したツイートのワードクラウドを作成できます. utils/wordcloud.py tweets.jsonl > wordcloud.html `replies`コマンドを用いていくつかのツイートを収集した場合, それらの静的な`D3.js`を用いたビジュアライゼーションを作成できます. utils/network.py tweets.jsonl tweets.html 必要に応じてユーザーごとにツイートを統合し, その中心のアカウントを表示できます. utils/network.py --users tweets.jsonl tweets.html [Gephi](https://gephi.org/)などのプログラムでネットワークグラフを使用する場合は, 次のようにGEXFファイルを生成できます. utils/network.py --users tweets.jsonl tweets.gexf `gender.py`は, 著者の性別に関する推測に基づいてツイートをフィルタリングできるフィルターです. 例えば, 女性からのもののように見えるすべてのツイートを抽出し, それらの単語クラウドを作成できます. utils/gender.py --gender female tweets.jsonl | utils/wordcloud.py > tweets-female.html 地理座標が利用可能なツイートから[GeoJSON](http://geojson.org/)を出力できます. utils/geojson.py tweets.jsonl > tweets.geojson 必要に応じて, バウンディングボックスを置き換える重心を用いたGeoJSONをエクスポートできます. utils/geojson.py tweets.jsonl --centroid > tweets.geojson また, 重心を用いたGeoJSONをエクスポートする場合に, ランダムファジングを追加することもできます. 
utils/geojson.py tweets.jsonl --centroid --fuzz 0.01 > tweets.geojson 地理座標の有無でツイートをフィルタリングするには, (場所については以下を参照:[API documentation](https://dev.twitter.com/overview/api/places)) utils/geofilter.py tweets.jsonl --yes-coordinates > tweets-with-geocoords.jsonl cat tweets.jsonl | utils/geofilter.py --no-place > tweets-with-no-place.jsonl GeoJSONのフェンスでツイートをフィルタリングするには, (要:[Shapely](https://github.com/Toblerity/Shapely)) utils/geofilter.py tweets.jsonl --fence limits.geojson > fenced-tweets.jsonl cat tweets.jsonl | utils/geofilter.py --fence limits.geojson > fenced-tweets.jsonl ツイートに重複があると思われる場合は, 重複の排除が可能です. utils/deduplicate.py tweets.jsonl > deduped.jsonl ID順にソートできます. これは, 時間順ソートに似ています. utils/sort_by_id.py tweets.jsonl > sorted.jsonl 特定の日付以前のすべてのツイートを除外できます. 例えば, 以下は関心のあるイベントの前, 別のイベントにハッシュタグが使用されていた場合です. utils/filter_date.py --mindate 1-may-2014 tweets.jsonl > filtered.jsonl 使用されているクライアントのHTMLリストを取得できます. utils/source.py tweets.jsonl > sources.html リツイートを削除する場合は, utils/noretweets.py tweets.jsonl > tweets_noretweets.jsonl またはURLの短縮を解除したい場合は, (要:[unshrtn](https://github.com/docnow/unshrtn)) cat tweets.jsonl | utils/unshrtn.py > unshortened.jsonl URLの短縮を解除すると, 最もよくツイートされたURLのランキングリストを取得できます. cat unshortened.jsonl | utils/urls.py | sort | uniq -c | sort -nr > urls.txt ## twarc-report [twarc-report](https://github.com/pbinkley/twarc-report)プロジェクトでは, [D3.js](http://d3js.org/)でのビジュアライゼーションでの使用に適したCSVまたはJSONを生成・出力するユーティリティスクリプトを用意しています. 以前はtwarcの一部であった`directed.py`は`d3graph.py`としてtwarc-reportプロジェクトに移管しました. またそれぞれのスクリプトは, ビジュアライゼーションのHTMLでのデモを生成できます. 具体例として, - [タイムライン](https://www.wallandbinkley.com/twarc/bill10/) - [リツイートの有向グラフ](https://www.wallandbinkley.com/twarc/bill10/directed-retweets.html) があります. 
--- 翻訳クレジット: [Haruna] [英語]: https://github.com/DocNow/twarc/blob/main/README.md [ポルトガル語]: https://github.com/DocNow/twarc/blob/main/README_pt_br.md [スペイン語]: https://github.com/DocNow/twarc/blob/main/README_es_mx.md [スウェーデン語]: https://github.com/DocNow/twarc/blob/main/README_sv_se.md [スワヒリ語]: https://github.com/DocNow/twarc/blob/main/README_sw_ke.md [ISO 639-1]: https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes [Haruna]: https://github.com/eggplants ================================================ FILE: docs/twarc1_pt_br.md ================================================ twarc1 ===== twarc é uma ferramenta de linha de comando e usa a biblioteca Python para arquivamento de dados do Twitter com JSON. Cada tweet será representado como um objeto JSON [exatamente](https://dev.twitter.com/overview/api/tweets) o que foi devolvido pela API do Twitter. Os Tweets serão armazenados como [JSON, um por linha](https://en.wikipedia.org/wiki/JSON_Streaming#Line-delimited_JSON). twarc controla totalmente a API [limites de uso](https://dev.twitter.com/rest/public/rate-limiting) para você. Além de permitir que você colete Tweets, twarc também pode ajudá-lo Coletar usuários, tendências e hidratar tweet ids. twarc Foi desenvolvido como parte [Documenting the Now](http://www.docnow.io) Projecto financiado pelo [Mellon Foundation](https://mellon.org/). ## Instalação Antes de usar twarc você precisa registrar um aplicativo em [apps.twitter.com](http://apps.twitter.com). Depois de criar o aplicativo, anote a [consumer_key], [consumer_secret] e clique em Gerar um [access_token] e um [access_token_secret]. Com estas quatro variáveis na mão você está pronto para começar a usar twarc. OBS: Se tiver alguma dúvida de como criar o aplicativo, consulte [como criar um app](http://blog.difluir.com/2013/06/como-criar-uma-app-no-twitter/) 1. instalação [Python](http://python.org/download) (2 ou 3) 2. 
pip install twarc ### Homebrew (macOS apenas) Para usuários do macOS, você pode instalar o `twarc` via: ```bash $ brew install twarc ``` ## Início Rápido: Primeiro você vai precisar configurar o twarc mostrando a ele suas chaves de API: twarc configure Em seguida, experimente uma pesquisa rápida: twarc search blacklivesmatter > search.jsonl Ou talvez você gostaria de coletar tweets como eles acontecem? twarc filter blacklivesmatter > stream.jsonl Veja abaixo os detalhes sobre esses comandos e muito mais. ## Uso ### Configurar Uma vez que você tem suas chaves de aplicativo, você pode dizer ao twarc quais são com o comando `configure`. twarc configure Isso irá armazenar as credenciais em um arquivo chamado `.twarc` em seu diretório home. Este arquivo será usado como padrão em outras chamadas. Se preferir, você pode fornecer diretamente as chaves (`CONSUMER_KEY`, `CONSUMER_SECRET`, `ACCESS_TOKEN`, `ACCESS_TOKEN_SECRET`) ou usando a linha de comando com as opções (`--consumer_key`, `--consumer_secret`, `--access_token`, `--access_token_secret`). ### Pesquisar Os usuários do Twitter [Pesquisar/tweets](https://dev.twitter.com/rest/reference/get/search/tweets) para baixar *pre-existing* tweets, correspondendo a uma determinada consulta que desejar. twarc search blacklivesmatter > tweets.jsonl É importante notar que `search` Irá retornar tweets encontrados dentro de uma Janela de 7 dias imposta pela API de pesquisa do Twitter. Se isso parece uma pequena Janela,e é, mas você pode estar interessado em coletar tweets como eles acontecem Usando o `filter` e `sample` comandos abaixo. A melhor maneira de se familiarizar com a sintaxe de pesquisa do Twitter é experimentando [Pesquisa Avançada do Twitter](https://twitter.com/search-advanced) E copiar e colar a consulta resultante da caixa de pesquisa. Por exemplo, aqui está uma consulta complicada que procura por tweets que contenham \#blacklivesmatter ou #blm hashtags que foram enviados para deray. 
twarc search '#blacklivesmatter OR #blm to:deray' > tweets.jsonl Você definitivamente também deve consultar o *excelente* guia de referência de Igor Brigadir à sintaxe de busca do Twitter: [Busca Avançada no Twitter](https://github.com/igorbrigadir/twitter-advanced-search/blob/master/README.md) Lá existem várias pérolas escondidas que não estão muito evidentes no formulário de pesquisa avançada. O Twitter tenta codificar a linguagem de um tweet e você pode limitar sua pesquisa para um idioma específico se quiser usando um código [ISO 639-1]: twarc search '#forabolsonaro' --lang pt > tweets.jsonl Você também pode pesquisar tweets com um determinado local, por exemplo tweets Mencionando *foratemer* das pessoas situadas a 1 milha na região de Brasília: twarc search foratemer --geocode -16.050561,-47.814708,1mi > tweets.jsonl Se uma consulta de pesquisa não for fornecida ao usar `--geocode`, você receberá todos os tweets relevantes para esse local e raio: twarc search --geocode -16.050561,-47.814708,1mi > tweets.jsonl ### Filter O comando `filter` Vai usar o Twitter [statuses/filter](https://dev.twitter.com/streaming/reference/post/statuses/filter) API para coletar tweets à medida que acontecem. twarc filter foratemer,blm > tweets.jsonl Observe que a sintaxe para consultas de queries do Twitter é ligeiramente diferente do que as consultas em sua API de pesquisa. Por favor, consulte a documentação sobre a melhor forma de expressar a opção de filtro que você deseja. Use o comando de linha `follow` com argumento se você quer coletar tweets de um determinado ID de usuário. Isso inclui retweets. Por exemplo, isso vai coletar tweets e os retweets da CNN: twarc filter --follow 759251 > tweets.jsonl Você também pode coletar tweets usando uma caixa delimitadora. Nota: o traço principal precisa ser escapado na caixa delimitadora ou então ele será interpretado como um comando de linha como argumento! 
Exemplo: escapando com a barra invertida após aspas "\ twarc filter --locations "\-74,40,-73,41" > tweets.jsonl Se você combinar opções eles serão um OU outro juntos. Por exemplo, isso irá coletar Tweets que usam as hashtags foratemer OU blm e também tweets do usuário CNN: twarc filter blacklivesmatter,blm --follow 759251 > tweets.jsonl Mas combinar locais e idiomas resultará efetivamente em um E. Para exemplo, isso irá coletar tweets da grande área de Nova York que estão em Espanhol ou francês: twarc filter --locations "\-74,40,-73,41" --lang es --lang fr ### Sample Use o comando `sample` para ouvir/Status do Twitter [statuses/sample](https://dev.twitter.com/streaming/reference/get/statuses/sample) API para uma amostra "aleatória/random" de tweets públicos recentes. O status será do usuário ativo na API twarc. twarc sample > tweets.jsonl ### Dehydrate O comando `dehydrate` gera uma lista de id de um arquivo de tweets: twarc dehydrate tweets.jsonl > tweet-ids.txt ### Hydrate O comando do twarc `hydrate` Lê um arquivo de IDs de tweets e escreve o tweet em JSON para eles usando Twitter [status/lookup](https://dev.twitter.com/rest/reference/get/statuses/lookup) API. twarc hydrate ids.txt > tweets.jsonl O [Termos do Serviço](https://dev.twitter.com/overview/terms/policy#6._Be_a_Good_Partner_to_Twitter) do Twitter API's desencoraja pessoas na busca de grandes quantidades de dados brutos do Twitter e disponibilizar na Web. Os dados podem ser usados para pesquisa e arquivados para uso local, mas não devem ser compartilhados com o mundo. O Twitter permite que arquivos de identificadores de tweet sejam compartilhados, o que pode ser útil quando você quer fazer um conjunto de dados de tweets disponíveis. Você pode usar a API do Twitter para *hydrate* dados ou para recuperar o JSON completo para cada identificador/usuário ID. Isto é particularmente importante para [verificação](https://en.wikipedia.org/wiki/Reproducibility) da rede social mundial. 
### Usuários O comando `users` retorna metadados do usuário fornecidos na tela, exemplo: twarc users deray,Nettaaaaaaaa > users.jsonl Você também pode usar os ids do usuário: twarc users 1232134,1413213 > users.jsonl Se você quiser, você também pode usar um arquivo com ids de usuário, o que pode ser útil se você estiver usando o `followers` e o `friends` conforme comando abaixo: twarc users ids.txt > users.jsonl ### Seguidores (Quem me segue) O comando `followers` Vai usar o Twitter [API seguidores ID](https://dev.twitter.com/rest/reference/get/followers/ids) Para coletar os ids dos usuários que estão seguindo exatamente o nome informado na tela. Veja como é feita a solicitação usando o nome do user como argumento: twarc followers deray > follower_ids.txt O resultado incluirá exatamente um ID de usuário por linha. A ordem de resposta é Invertida cronologicamente, os seguidores mais recentes em primeiro lugar. ### Amigos (Quem eu sigo) Igual o comando `followers`, o comando `friends` usará o Twitter [API amigos ID](https://dev.twitter.com/rest/reference/get/friends/ids) Para coletar os IDs de usuário amigo/friends com o nome que foi informado na tela no momento da solicitação, conforme especificado abaixo no argumento: twarc friends deray > friend_ids.txt ### Trends / tendências O comando `trends` permite recuperar informações da API do Twitter sobre hashtags tendências. Você precisa fornecer um [Onde na Terra](http://developer.yahoo.com/geo/geoplanet/) identificador (`woeid`) para indicar quais as tendências que você está interessado. 
Por exemplo, aqui é como você pode obter as tendências atuais para St Louis: twarc trends 2486982 Usando um `woeid` de 1 irá retornar tendências para todo o planeta, ou trends mundiais: twarc trends 1 Se você não tem certeza do que usar como um "woeid", não se preocupe, apenas omita seu valor e você receberá uma lista de todos os lugares para os quais o Twitter acompanha as tendências: twarc trends Se você já tem uma [geo-location/geo localização], você pode usar diretamente no lugar do `woeid`. twarc trends 39.9062,-79.4679 Por trás das cenas, o twarc buscará o local usando o Twitter [trends/closest](https://dev.twitter.com/rest/reference/get/trends/closest) API para encontrar a `woeid`. ### Timeline O comando timeline usará do Twitter [API user timeline](https://dev.twitter.com/rest/reference/get/statuses/user_timeline) Para coletar os tweets mais recentes postados pelo usuário indicado por um screen_name. twarc timeline deray > tweets.jsonl Você também pode procurar usuários usando um id de usuário: twarc timeline 12345 > tweets.jsonl ### Retuítes Você pode obter retuítes para um determinado id de tweet como este: twarc retweets 824077910927691778 > retweets.jsonl Se você tiver tweet_ids para os quais gostaria de buscar os retuítes, você pode: twarc retweets ids.txt > retweets.jsonl ### Respostas Infelizmente, a API do Twitter não suporta atualmente a obtenção de respostas para um tweet. Portanto, o twarc o aproxima usando a API de pesquisa. Como a API de pesquisa não suporta a obtenção de tweets com mais de uma semana, o twarc só pode obter todas as respostas a um tweet que foram enviadas na última semana. Se você deseja obter respostas para um determinado tweet, você pode: twarc replies 824077910927691778 > replies.jsonl Usar a opção `--recursive` também irá buscar respostas para as respostas, bem como citações. Isso pode levar muito tempo para ser concluído em um thread grande por causa de limitação de taxa pela API de pesquisa. 
twarc replies 824077910927691778 --recursive ### Listas Para obter os usuários que estão em uma lista, você pode usar o URL da lista com o comando `listmembers`: twarc listmembers https://twitter.com/edsu/lists/bots ## Premium Search API O Twitter introduziu uma API de pesquisa premium que permite que você pague dinheiro ao Twitter por tweets. Depois de configurar um ambiente em seu [painel](https://developer.twitter.com/en/dashboard) você pode usar seus 30 dias e endpoints fullarchive para pesquisar tweets fora da janela de 7 dias fornecida pela API de pesquisa padrão. Para usar a API premium na linha de comando, você precisará indicar qual terminal você está usando e o ambiente. Para evitar usar todo o seu orçamento, você provavelmente desejará limitar o intervalo de tempo usando `--to_date` e `--from_date`. Além disso, você pode limitar o número máximo de tweets retornados usando `--limit`. Por exemplo, se eu quisesse obter todos os tweets blacklivesmatter de duas semanas atrás (supondo que hoje seja 1 de Junho de 2020) usando meu ambiente chamado *docnowdev*, mas não recuperando mais de 1000 tweets, eu poderia: twarc search blacklivesmatter \ --30day docnowdev \ --from_date 2020-05-01 \ --to_date 2020-05-14 \ --limit 1000 \ > tweets.jsonl Da mesma forma, para encontrar tweets de 2014 usando o arquivo completo, você pode: twarc search blacklivesmatter \ --fullarchive docnowdev \ --from_date 2014-08-04 \ --to_date 2014-08-05 \ --limit 1000 \ > tweets.jsonl Se o seu ambiente for sandbox, você precisará usar `--sandbox` para que o twarc saiba que não deve solicitar mais de 100 tweets por vez (o padrão para ambientes sem sandbox é 500) twarc search blacklivesmatter \ --fullarchive docnowdev \ --from_date 2014-08-04 \ --to_date 2014-08-05 \ --limit 1000 \ --sandbox \ > tweets.jsonl ## Usar twarc como uma biblioteca Se você quiser pode usar `twarc` programaticamente como uma biblioteca para coletar Tweets. 
Primeiro você precisa criar uma instância do `twarc` (usando as suas Credenciais do Twitter) e, em seguida, usá-lo para iterar através de resultados de pesquisa ou filtrar resultados de pesquisa. ```python from twarc import Twarc t = Twarc(consumer_key, consumer_secret, access_token, access_token_secret) for tweet in t.search("ferguson"): print(tweet["text"]) ``` Você pode fazer o mesmo para um fluxo de filtro de novos tweets que correspondem a uma determinada faixa usando palavra-chave. ```python for tweet in t.filter(track="ferguson"): print(tweet["text"]) ``` ou localização: ```python for tweet in t.filter(locations="-74,40,-73,41"): print(tweet["text"]) ``` ou IDS do usuário: ```python for tweet in t.filter(follow='12345,678910'): print(tweet["text"]) ``` Da mesma forma você pode hidratar os identificadores de tweet passando em uma lista de ids ou um gerador: ```python for tweet in t.hydrate(open('ids.txt')): print(tweet["text"]) ``` ## User x App Auth twarc gerenciará a limitação de taxas pelo Twitter. No entanto, você deve saber que a limitação de taxa varia de acordo com a maneira como você autentica. As duas opções são User Auth e App Auth. O padrão do twarc é usar a autenticação do usuário, mas você pode dizer a ele para usar o App Auth. Mudar para App Auth pode ser útil em algumas situações, como quando você está pesquisando tweets, já que o User Auth só pode emitir 180 solicitações a cada 15 minutos (1,6 milhões de tweets por dia), mas o App Auth pode emitir 450 (4, 3 milhões de tweets por dia). Mas tenha cuidado: o endpoint `statuses / lookup` usado pelo subcomando hydrate tem um limite de taxa de 900 solicitações por 15 minutos para autenticação do usuário e 300 solicitações por 15 minutos para App Auth. 
Se você sabe o que está fazendo e deseja forçar o App Auth, pode usar a opção de linha de comando `--app_auth`: twarc --app_auth search ferguson > tweets.jsonl Da mesma forma, se você estiver usando twarc como uma biblioteca, você pode: ```python from twarc import Twarc t = Twarc(app_auth=True) for tweet in t.search('ferguson'): print(tweet['id_str']) ``` ## Utilitários No diretório utils existem alguns utilitários via linha de comando simples para Trabalhar com o JSON gravando linha por linha, tais como. - Imprimir os tweets arquivados como Texto ou html. - Extraindo os nomes de usuários. - URLs referenciadas. - Etc. Se você criar um Script e achar útil, por favor envie um pedido de pull no github do projeto. Quando você tem alguns tweets você pode criar um paralelo rudimentar deles: utils/wall.py tweets.jsonl > tweets.html Você pode criar uma nuvem de palavras de tweets coletados sobre a nasa: utils/wordcloud.py tweets.jsonl > wordcloud.html Se você coletou alguns tweets usando `replies`, você pode criar uma visualização estática D3 deles com: utils/network.py tweets.jsonl tweets.html Opcionalmente, você pode consolidar tweets por usuário, permitindo que você veja contas centrais: utils/network.py --users tweets.jsonl tweets.html Além disso, você pode criar uma rede de hashtags, permitindo que você visualize sua alocação: utils/network.py --hashtags tweets.jsonl tweets.html E se você quiser usar o gráfico de rede em um programa como [Gephi](https://gephi.org/), você pode gerar um arquivo GEXF com o seguinte: utils/network.py --users tweets.jsonl tweets.gexf utils/network.py --hashtags tweets.jsonl tweets.gexf gender.py É um filtro que permite filtrar tweets com base em um palpite sobre o gênero do autor. 
Assim, por exemplo, você pode filtrar todos os tweets que em tese foram feitos por mulheres, e criar uma nuvem de palavras para eles: utils/gender.py --gender female tweets.jsonl | utils/wordcloud.py > tweets-female.html Você pode com [GeoJSON](http://geojson.org/) ver os tweets de determinadas coordenadas geográficas: utils/geojson.py tweets.jsonl > tweets.geojson Opcionalmente você pode exportar GeoJSON com centróides substituindo as caixas delimitadoras: utils/geojson.py tweets.jsonl --centroid > tweets.geojson E se você exportar GeoJSON com centróides, você pode adicionar alguns fuzzing aleatórios: utils/geojson.py tweets.jsonl --centroid --fuzz 0.01 > tweets.geojson Para filtrar tweets pela presença ou ausência de coordenadas geográficas (Ou Local, veja [Documentação da API locais](https://dev.twitter.com/overview/api/places)): utils/geofilter.py tweets.jsonl --yes-coordinates > tweets-with-geocoords.jsonl cat tweets.jsonl | utils/geofilter.py --no-place > tweets-with-no-place.jsonl Para filtrar tweets por uma área com GeoJSON (Requer [Shapely](https://github.com/Toblerity/Shapely)): utils/geofilter.py tweets.jsonl --fence limits.geojson > fenced-tweets.jsonl cat tweets.jsonl | utils/geofilter.py --fence limits.geojson > fenced-tweets.jsonl Se você suspeitar ter duplicado seus tweets, você pode remove-los: utils/deduplicate.py tweets.jsonl > deduped.jsonl Você pode classificar por ID, o que é análogo à classificação por tempo: utils/sort_by_id.py tweets.jsonl > sorted.jsonl Você pode filtrar todos os tweets antes de uma determinada data (por exemplo, se uma hashtag foi usada para outro evento antes do que você está interessado): utils/filter_date.py --mindate 1-may-2014 tweets.jsonl > filtered.jsonl Você pode obter uma lista HTML dos usuários usados: utils/source.py tweets.jsonl > sources.html Se você quiser remover os retweets: utils/noretweets.py tweets.jsonl > tweets_noretweets.jsonl Ou unshorten urls (Requer [unshrtn](https://github.com/edsu/unshrtn)): cat 
tweets.jsonl | utils/unshrtn.py > unshortened.jsonl Depois de desencurtar seus URLs, você pode obter uma lista classificada dos URLs mais tuitados: cat unshortened.jsonl | utils/urls.py | sort | uniq -c | sort -nr > urls.txt ## twarc-report Alguns scripts de utilitários adicionais para gerar saída csv ou json adequada foi feito com [D3.js](http://d3js.org/) Visualizações são encontradas [twarc-report](https://github.com/pbinkley/twarc-report) projeto. O Util direct.py, anteriormente parte do twarc, mudou-se para twarc-report como d3graph.py. Cada script também pode gerar uma demo html de uma visualização D3, por exemplo [timelines](https://wallandbinkley.com/twarc/bill10/) ou um [grafo direcionado de retweets](https://wallandbinkley.com/twarc/bill10/directed-retweets.html). --- Tradução créditos: [Wilson Jr] [Espanhol]: https://github.com/DocNow/twarc/blob/main/README_es_mx.md [Inglês]: https://github.com/DocNow/twarc/blob/main/README.md [Japonês]: https://github.com/DocNow/twarc/blob/main/README_ja_jp.md [Sueco]: https://github.com/DocNow/twarc/blob/main/README_sv_se.md [Suaíli]: https://github.com/DocNow/twarc/blob/main/README_sw_ke.md [ISO 639-1]: https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes [Wilson Jr]: https://github.com/py3in ================================================ FILE: docs/twarc1_sv_se.md ================================================ twarc1 ===== twarc är ett kommandoradsverktyg och ett Pythonbibliotek för arkivering av Twitter JSON data. Varje tweet är representerat som ett JSON-objekt som är [exakt](https://dev.twitter.com/overview/api/tweets) vad som returneras från Twitters API. Tweets lagras som [line-oriented JSON](https://en.wikipedia.org/wiki/JSON_Streaming#Line-delimited_JSON). twarc hanterar Twitter API:ets [rate limits](https://dev.twitter.com/rest/public/rate-limiting) åt dig. Förutom att kunna samla in tweets kan även twarc hjälpa dig att samla in användare, trender och omvandla tweet-id:n till tweets. 
twarc har utvecklats som en del av [Documenting the Now](http://www.docnow.io) projektet som finansierades av [Mellon Foundation](https://mellon.org/). ## Installera Innan du använder twarc behöver du registrera en applikation hos [apps.twitter.com](http://apps.twitter.com). När du har skapat din applikation, skriv ner consumer key, consumer secret och klicka för att generera en access token och en access token secret. Med dessa fyra variabler är du redo att börja använda twarc. 1. Installera [Python](http://python.org/download) (2 eller 3) 2. pip install twarc (om du uppgraderar: pip install --upgrade twarc) ## Snabbstart: Först måste du tala om för twarc vad dina API-nycklar är och tillåta åtkomst till ett eller flera twitterkonton: twarc configure Prova att köra: twarc search blacklivesmatter > search.jsonl Eller om du vill samla in tweets i samma ögonblick de skapas: twarc filter blacklivesmatter > stream.jsonl Se nedan för detaljer om dessa och fler kommandon. ## Användning ### Konfigurera När du har dina applikationsnycklar så kan du tala om för twarc vilka de är med `configure` kommandot. twarc configure Detta kommer att lagra dina nycklar i en fil som heter `.twarc` placerad i din hemkatalog så du slipper att skriva in dem varje gång. Om du hellre vill tilldela dom direkt så kan du göra det i environment (`CONSUMER_KEY`, `CONSUMER_SECRET`, `ACCESS_TOKEN`, `ACCESS_TOKEN_SECRET`) eller genom att använda kommandoradsparameter options (`--consumer_key`, `--consumer_secret`, `--access_token`, `--access_token_secret`). ### Sök Detta använder Twitters [search/tweets](https://dev.twitter.com/rest/reference/get/search/tweets) för att ladda ner *redan befintliga* tweets som matchar en given söksträng. twarc search blacklivesmatter > tweets.jsonl Det är viktigt att notera att `search` returnerar tweets som hittas inom det 7-dagarsfönster som Twitters sök-API erbjuder. Känns det som ett smalt fönster? Det är det. 
Men du kanske är intresserad av att samla in tweets i samma ögonblick som de skapas genom att använda `filter` och `sample` kommandona nedan. Det bästa sättet att bekanta sig med Twitters söksyntax är att experimentera med [Twitters Avancerade Sök](https://twitter.com/search-advanced) och kopiera och klistra in söksträngen från sökboxen. Här är till exempel en mer avancerad söksträng som matchar tweets innehållande antingen \#blacklivesmatter eller #blm hashtaggar som skickats till deray: twarc search '#blacklivesmatter OR #blm to:deray' > tweets.jsonl Twitter försöker att koda en tweets språk, och du kan begränsa sökningen till ett specifikt språk om du vill: twarc search '#blacklivesmatter' --lang fr > tweets.jsonl Du kan också söka efter tweets inom en given plats, till exempel tweets som nämner *blacklivesmatter* som är 1 mile från centrala Ferguson, Missouri: twarc search blacklivesmatter --geocode 38.7442,-90.3054,1mi > tweets.jsonl Om inte en söksträng ges när du använder `--geocode` kommer du få alla tweets som är relevanta för den platsen och radien. twarc search --geocode 38.7442,-90.3054,1mi > tweets.jsonl ### Filter `filter` Kommandot använder Twitters [statuses/filter](https://dev.twitter.com/streaming/reference/post/statuses/filter) API för att samla in tweets i samma ögonblick som de skapas. twarc filter blacklivesmatter,blm > tweets.jsonl Notera att syntaxen för Twitters track söksträngar är något annorlunda än de som används i sök-API:et. Var god läs dokumentationen för att se hur du bäst kan formulera sökningar. Använd `follow` kommandot om du vill samla in tweets från ett specifikt användar-id i samma ögonblick som de skapas. Detta inkluderar retweets. Till exempel så samlar detta in tweets och retweets från CNN: twarc filter --follow 759251 > tweets.jsonl Du kan också samla in tweets genom att använda koordinater. Notera: det inledande bindestrecket behöver escapas, annars kommer det tolkas som en kommandoradsparameter! 
twarc filter --locations "\-74,40,-73,41" > tweets.jsonl Om du kombinerar parametrar så kommer de tolkas som OR Till exempel så kommer detta samla in tweets som använder blacklivesmatter eller blm hashtaggen och som också postats av användaren CNN: twarc filter blacklivesmatter,blm --follow 759251 > tweets.jsonl ### Sample Använd `sample` kommandot för att "lyssna" på Twitters [statuses/sample](https://dev.twitter.com/streaming/reference/get/statuses/sample) API för ett "slumpmässigt" prov av nyligen skapade publika tweets. twarc sample > tweets.jsonl ### Dehydrering `dehydrate` kommandot genererar en lista med identifierare från en fil med tweets: twarc dehydrate tweets.jsonl > tweet-ids.txt ### Hydrering twarc's `hydrate` kommando läser en fil med tweetidentifierare och skriver ut som tweet JSON genom Twitters [status/lookup](https://dev.twitter.com/rest/reference/get/statuses/lookup) API. twarc hydrate ids.txt > tweets.jsonl Twitter APIs [Terms of Service](https://dev.twitter.com/overview/terms/policy#6._Be_a_Good_Partner_to_Twitter) uppmuntrar inte folk att tillgängliggöra stora mängder av rå Twitterdata på webben. Datan kan användas för forskning och arkiveras lokalt, men kan inte delas med världen. Twitter tillåter emellertid att identifierare delas, vilket kan vara bra när du vill tillgängliggöra ett dataset. Du kan då använda Twitters API för att *hydrera* datan, eller för att hämta den fulla JSON-objektet för varje identifierare. Detta är särskilt viktigt för [verifiering](https://en.wikipedia.org/wiki/Reproducibility) av forskning på social media. ### Användare `users` kommandot retunerar metadata för angivna screen names. 
twarc users deray,Nettaaaaaaaa > users.jsonl Du kan också använda användar-id: twarc users 1232134,1413213 > users.jsonl Om du vill kan du också använda en fil med användar-id, vilket kan vara användbart om du använder `followers` och `friends` kommandona nedan: twarc users ids.txt > users.jsonl ### Följare `followers` kommandot använder Twitters [follower id API](https://dev.twitter.com/rest/reference/get/followers/ids) för att samla in följarens användar-id för exakt ett screen name per request specificerat som ett argument: twarc followers deray > follower_ids.txt Resultatet inkluderar exakt ett användar-id per linje ordnat i omvänd kronologisk ordning, alltså de senaste följarna först. ### Vänner Precis som `followers` kommandot, använder `friends` kommandot Twitters [friend id API](https://dev.twitter.com/rest/reference/get/friends/ids) för att samla in vänners användar-id för exakt ett screen name per request, specificerat som ett argument: twarc friends deray > friend_ids.txt ### Trender `trends` kommandot låter dig hämta information från Twitters API om trendande hashtags. Du måste bifoga en [Where On Earth](http://developer.yahoo.com/geo/geoplanet/) identifierare (`woeid`) för att precisera vilka trender du är intresserad av. Till exempel kan du hämta de senaste trenderna för St. Louis på det här viset: twarc trends 2486982 Använder du ett `woeid` på 1 så kommer du få trender för hela världen: twarc trends 1 Om du inte är säker på vad du ska använda för `woeid` så kan du helt enkelt utesluta det för att få en lista över alla platser Twitter har trender för: twarc trends Om du har en geo-position så kan du använda den istället för `woeid`. twarc trends 39.9062,-79.4679 Bakom kulisserna så hjälper twarc dig genom Twitters [trends/closest](https://dev.twitter.com/rest/reference/get/trends/closest) API att hitta närmaste `woeid`. 
### Tidslinje `timeline` kommandot använder Twitters [user timeline API](https://dev.twitter.com/rest/reference/get/statuses/user_timeline) för att samla in de senaste tweetsen skapade av en användare baserat på screen_name. twarc timeline deray > tweets.jsonl Du kan också använda användar-id: twarc timeline 12345 > tweets.jsonl ### Retweets Du kan samla in retweets för ett givet tweetid genom: twarc retweets 824077910927691778 > retweets.jsonl ### Svar Tyvärr så stödjer inte Twitters API att hämta svar till en tweet. twarc använder istället sök-API:et för detta. Då sök-API:et inte kan användas för att samla in tweets äldre än en vecka kan twarc endast hämta alla svar till en tweet som har postats den senaste veckan. Om du vill hämta svaren till en tweet så kan du använda följande: twarc replies 824077910927691778 > replies.jsonl Genom att använda `--recursive` parametern så hämtas även svar till svar så väl som citerade tweets. Detta kan ta mycket lång tid att köra på stora trådar på grund av rate limiting på sök-API:et. twarc replies 824077910927691778 --recursive ### Listor För att hämta användare som är med på en lista kan du använda list-URL:en med `listmembers` kommandot: twarc listmembers https://twitter.com/edsu/lists/bots ## Använd som ett bibliotek Du kan också använda twarc programatiskt som ett bibliotek för att samla in tweets. Du behöver först skapa en instans av `twarc` (genom att använda dina nycklar) , och sedan använda det för att iterera genom sökresultat, filter och resultat. 
```python from twarc import Twarc t = Twarc(consumer_key, consumer_secret, access_token, access_token_secret) for tweet in t.search("ferguson"): print(tweet["text"]) ``` Du kan göra samma sak för en ström som matchar ett nyckelord ```python for tweet in t.filter(track="ferguson"): print(tweet["text"]) ``` eller en position: ```python for tweet in t.filter(locations="-74,40,-73,41"): print(tweet["text"]) ``` eller användar-id: ```python for tweet in t.filter(follow='12345,678910'): print(tweet["text"]) ``` På samma sätt kan du hydrera tweetid:n genom att bearbeta en lista med idn eller en generator: ```python for tweet in t.hydrate(open('ids.txt')): print(tweet["text"]) ``` ## Verktyg I utils-mappen finns ett antal enkla kommandoradsverktyg för att bearbeta linjeorienterad JSON, så som att skriva ut arkiverade tweets som text eller html, extrahera användarnamn, refererade url:er, m.m. Om du skapar ett skript som du tycker är bra så får du gärna skicka en pull request. När du samlat in lite tweets kan du skapa en rudimentär vägg av dem: % utils/wall.py tweets.jsonl > tweets.html Du kan skapa ett ordmoln baserat på tweets du samlat in: % utils/wordcloud.py tweets.jsonl > wordcloud.html Om du har samlat in tweets genom att använda `replies` kan du skapa en statisk D3 visualisering av dem med: % utils/network.py tweets.jsonl tweets.html Du kan även slå samman tweets per användare, vilket gör att du kan se centrala konton. % utils/network.py --users tweets.jsonl tweets.html Och om du vill använda nätverksgrafen i ett program som [Gephi](https://gephi.org/), så kan du generera en GEXF-fil med följande: % utils/network.py --users tweets.jsonl tweets.gexf gender.py är ett filter som låter dig filtrera tweets baserat på en gissning av författarens kön. 
Till exempel kan du filtrera ut alla tweets som ser ut som de var skrivna av kvinnor och skapa ett ordmoln: % utils/gender.py --gender female tweets.jsonl | utils/wordcloud.py > tweets-female.html Du kan få ut [GeoJSON](http://geojson.org/) från tweets där geo-koordinater finns tillgängliga: % utils/geojson.py tweets.jsonl > tweets.geojson Alternativt kan du exportera GeoJSON med centroider som ersättning för bounding boxes: % utils/geojson.py tweets.jsonl --centroid > tweets.geojson Och om du exporterar GeoJSON med centroider, så kan du lägga till lite slumpmässig fuzz: % utils/geojson.py tweets.jsonl --centroid --fuzz 0.01 > tweets.geojson För att filtrera tweets baserat på tillgänglighet av geo-koordinater (eller plats, se [API documentation](https://dev.twitter.com/overview/api/places)): % utils/geofilter.py tweets.jsonl --yes-coordinates > tweets-with-geocoords.jsonl % cat tweets.jsonl | utils/geofilter.py --no-place > tweets-with-no-place.jsonl För att filtrera tweets genom ett GeoJSON-staket (Kräver [Shapely](https://github.com/Toblerity/Shapely)): % utils/geofilter.py tweets.jsonl --fence limits.geojson > fenced-tweets.jsonl % cat tweets.jsonl | utils/geofilter.py --fence limits.geojson > fenced-tweets.jsonl Om du misstänker att du har duplikat i dina tweetinsamlingar kan du ta bort duplikaten: % utils/deduplicate.py tweets.jsonl > deduped.jsonl Du kan sortera efter ID, vilket är samma sak som att sortera efter tid. 
% utils/sort_by_id.py tweets.jsonl > sorted.jsonl Du kan filtrera bort alla tweets före ett specifikt datum (till exempel, om en hashtag användes för en annan händelse före det du är intresserad av): % utils/filter_date.py --mindate 1-may-2014 tweets.jsonl > filtered.jsonl Du kan få en lista i HTML över vilka klienter som använts: % utils/source.py tweets.jsonl > sources.html Om du vill ta bort retweets: % utils/noretweets.py tweets.jsonl > tweets_noretweets.jsonl Eller lösa förkortade url:er (kräver [unshrtn](https://github.com/edsu/unshrtn)): % cat tweets.jsonl | utils/unshrtn.py > unshortened.jsonl När du har löst de förkortade url:erna kan du få en ranklista över de mest tweetade url:erna: % cat unshortened.jsonl | utils/urls.py | sort | uniq -c | sort -nr > urls.txt ## twarc-report Ytterligare verktyg för att generera CSV-filer eller json lämpad för att använda med [D3.js](http://d3js.org/) visualiseringar kan du hitta i [twarc-report](https://github.com/pbinkley/twarc-report) projektet. Verktyget `directed.py`, tidigare en del av twarc, har flyttat till twarc-report som `d3graph.py`. Varje skript kan också generera en html-demo av en D3 visualisering, t.ex. [timelines](https://wallandbinkley.com/twarc/bill10/) eller en [riktad graf av retweets](https://wallandbinkley.com/twarc/bill10/directed-retweets.html). Översättning: [Andreas Segerberg] [Engelska]: https://github.com/DocNow/twarc/blob/main/README.md [Japanska]: https://github.com/DocNow/twarc/blob/main/README_ja_jp.md [Portugisiska]: https://github.com/DocNow/twarc/blob/main/README_pt_br.md [Spanska]: https://github.com/DocNow/twarc/blob/main/README_es_mx.md [Swahili]: https://github.com/DocNow/twarc/blob/main/README_sw_ke.md [Andreas Segerberg]: https://github.com/Segerberg ================================================ FILE: docs/twarc1_sw_ke.md ================================================ twarc1 ===== twarc ni chombo ya command-line na Python Library ya kuhifadhi Twitter JSON data. 
Kila Tweet ita akilishwa kama kitu ya JSON ita onyeshwa [hivi](https://dev.twitter.com/overview/api/tweets) kutoka kwa Twitter API. Tweets zita wekwa kama [line-oriented JSON](https://en.wikipedia.org/wiki/JSON_Streaming#Line-delimited_JSON). twarc ita kusaidia ku chunga [rate limits](https://dev.twitter.com/rest/public/rate-limiting) ya API ya Twitter. twarc pia ita sanya tweets, watumiaji wa Twitter, uwenendo za Twitter na ita hydrate tweet ids. twarc imeundwa kama sehemu ya [Documenting the Now](http://www.docnow.io) ambayo ilifadhiliwa na [Mellon Foundation](https://mellon.org/). ## Weka Kabla kutumia twarc utahitaji kujiandikisha kwa [apps.twitter.com](http://apps.twitter.com). Mara baada ya kuunda programu yako andika `consumer key` and `consumer secret` yako alafu bonyeza kuzalisha `access token` na `access token secret`. Uta hitaji hizi vigezo nne ku tumia twarc 1. weka [Python](http://python.org/download) (2 or 3) 2. pip install twarc (ama kuboresha: pip install --upgrade twarc) ## Haraka Haraka Utahitaji kuambia twarc vifunguo ya API ya Twitter twarc configure alafu jaribu kuchungua na: twarc search blacklivesmatter > search.jsonl Ama wataka kusanya ma tweets kama zinatoka twarc filter blacklivesmatter > stream.jsonl Endelea kusoma ku pata maelezo kuhusu utumizi wa twarc ## Matumizi ### Sanidi Mara tu una vifunguo vya Twitter unaweza kuambia twarc ukitumia command ya `configure`. twarc configure twarc ita andika sifa zako kwenye file itayo itwa `.twarc` kwa saraka ya home. Kama hutaki ama huwezi kuandika file hiyo unaweza kutumia command inayo tumia mazingira yako. (`CONSUMER_KEY`, `CONSUMER_SECRET`, `ACCESS_TOKEN`, `ACCESS_TOKEN_SECRET`) ama chagua command line (`--consumer_key`, `--consumer_secret`, `--access_token`, `--access_token_secret`). 
### Uchunguzi Hutumia [uchunguzi wa tweets](https://dev.twitter.com/rest/reference/get/search/tweets) kupakua tweets zilizoandikwa zinazo swala twarc search blacklivesmatter > tweets.jsonl Ni muhimu kukumbuka swali yako ita pakua tweets za mda wa siku 7 inayo tiwa na API ya Twitter. Kama swali yako inataka mda wa siku nane au zaidi waeza kutumia `filter` ama `sample` commands kama hizi. Njia bora ya kujifunza na uchunguzi wa Twitter Search API ni ku jaribu [Twitter's Advanced Search](https://twitter.com/search-advanced) alafu kuitumia kwa twarc. Kwa mfano hapa tuna tafuta ma tweets zinazo \#blacklivesmatter ama #blm hashtags zilizo tumwa kwa deray. twarc search '#blacklivesmatter OR #blm to:deray' > tweets.jsonl Twitter hujaribu kuweka lugha ya tweet na unaweza kupunguza kikoma yako kwa lugha ukitaka twarc search '#blacklivesmatter' --lang fr > tweets.jsonl Unaweza pia kutafuta tweets za mahali fulani kwa mfano tweets zinazo taja *blacklivesmatter* zilizo maili 1 kutoka katikati ya Ferguson, Missouri: twarc search blacklivesmatter --geocode 38.7442,-90.3054,1mi > tweets.jsonl Ikiwa swali yako haina maneno lakini umetumia `--geocode` utapata tweets zote za eneo hio. twarc search --geocode 38.7442,-90.3054,1mi > tweets.jsonl ### Chuja Utumizi wa `filter` command husanya tweets zikiandikwa no hutumia [statuses/filter](https://dev.twitter.com/streaming/reference/post/statuses/filter) API. twarc filter blacklivesmatter,blm > tweets.jsonl Tafadhali kumbuka kuwa syntax ya Twitter ni tofauti na Twitter ya uchunguzi. Tafadhali wasiliana na nyaraka jinsi ya kueleza chujia unayo tumia Tumia command ya `follow` kama wataka kusanya tweets kutoka kwa mtumiaji kama zinatokea. Hi inajumuisha retweets. Kwa mfano hii itasanya tweets na retweets za CNN: twarc filter --follow 759251 > tweets.jsonl Waeza kusanya tweets kwa kutumia sanduku linalozingatia. Kumbuka: dash inayoongoza inahitaji kutoroka katika sanduku linalozingatia ama ita fasiriwa kama command line argument! 
twarc filter --locations "\-74,40,-73,41" > tweets.jsonl Ikiwa unachanganya chaguzi yako au OR'ed pamoja. Kwa mfano hii ita sanya tweets zinasotumia blacklivesmatter ama blm na pia tweets kutoka mtumiaji CNN: twarc filter blacklivesmatter,blm --follow 759251 > tweets.jsonl ### Sampuli Tumia `sample` command kusikiliza kwa sampuli ya Twitter [statuses/sample](https://dev.twitter.com/streaming/reference/get/statuses/sample) statuses hivi karibuni twarc sample > tweets.jsonl ### Punguza maji twarc ina `dehydrate` command ita tengeneza orodha ya id kutoka faili ya tweets: twarc dehydrate tweets.jsonl > tweet-ids.txt ### Hydrate twarc pia ina `hydrate` command ita soma faili inayo id na ita andika faili mpya ya tweet JSON kwa kutumiya Twitter [status/lookup](https://dev.twitter.com/rest/reference/get/statuses/lookup) API. twarc hydrate ids.txt > tweets.jsonl API ya Twitter [Masharti ya Huduma](https://dev.twitter.com/overview/terms/policy#6._Be_a_Good_Partner_to_Twitter) huwazuia watu kutengeza kiasi kubwa ya Twitter data ipatikane kwenye Web. Hiyo data yaeza kutumiwa kwa uchunguzi bora isi shirikiana na ulimwengu. Twitter huruhusu mafaili ya tweet identifiers kugawanywa no hiyo inaweza kuwa na manufaa. Waeza kutumia API ya Twitter ku *hydrate* hiyo data ama kupata kamili ya JSON. Hi ni muhimu kwa [uthibitishaji](https://en.wikipedia.org/wiki/Reproducibility) ya social media research. ### Watumiaji Utumizi was `users` command hurudisha metadata ya majina ya skrini iliyopewa twarc users deray,Nettaaaaaaaa > users.jsonl Waeza pia kuipatia ids za watumiaji twarc users 1232134,1413213 > users.jsonl Waeza kutumia faili iliyo na ids za watumiaji kwa mfano wataka `followers` na `friends` commands twarc users ids.txt > users.jsonl ### Wafuasi Utumizi wa `followers` hutegemeya [follower id API](https://dev.twitter.com/rest/reference/get/followers/ids) ku kusanya ids za mfuasi moja kwa kila ombi. 
Kwa mfano: twarc followers deray > follower_ids.txt ita rudisha mfuasi moja kwa kila laini. Faili yako ita andikwa na wafuasi wa hivi karibuni kwanza. ### Mwelekeo Utumizi wa `trends` hutegemeya API ya Twitter ya mwelekeo wa hashtags. Unahitaji kuipatia [Where On Earth](http://developer.yahoo.com/geo/geoplanet/) identifier (`woeid`) kuiambia mwenendo unayopenda. Kwa mfano kama wataka maelekeo ya St. Louis: twarc trends 2486982 Ukitumia `woeid` ya 1 itarudisha mwenendo wa dunia yote. twarc trends 1 Ikiwa hujui nini cha kutumia ya `woeid` iache na utapata maeneo yote ambayo Twitter hufuata: twarc trends Kama una geo-location waeza kuitimia badala ya `woeid` twarc trends 39.9062,-79.4679 Twitter ita tumia API ya [trends/closest](https://dev.twitter.com/rest/reference/get/trends/closest) ili kupata `woeid` iliyo karibu nawe ### Muda wa wakati Utumiaji wa `timeline` command hutegemeya kwa API ya [user timeline API](https://dev.twitter.com/rest/reference/get/statuses/user_timeline) kukusanya Tweets za mtumiaji alionyeshwa na `screen_name`: twarc timeline deray > tweets.jsonl Unaweza pia kuangalia juu ya watumiaji kwa kutumia id ya mtumiaji twarc timeline 12345 > tweets.jsonl ### Retweets Unaweza kupata retweets kwa kuipeya id ya tweet hivi: twarc retweets 824077910927691778 > retweets.jsonl ### Majibu Twitter haina API ambayo inaweza kupata majibu za tweet. twarc hujaribu kwa kutumia search API. Lakino search API haiwezi kupata majibu zaidi ya siku saba. Ikiwa unataka kupata majibu ya tweets fanya hivi: twarc replies 824077910927691778 > replies.jsonl Utumizi wa `--recursive` utapata majibu ya majibu na quotes. Hii inaweza kuchukua muda mrefu kukamilisha kama una majibu mengi kwa sababu ya kiwango cha kupunguzwa search API. 
twarc replies 824077910927691778 --recursive ### Orodha Ili kupata watumiaji walio kwenye orodha unaweza kutumia URL ya orodha na command ya `listmembers` twarc listmembers https://twitter.com/edsu/lists/bots ## Tumia kama Maktaba Ikiwa unataka kutumia twarc programatically kama maktaba kukusanya tweets. Kwanza utahitaji kuunda `twarc` instance yako. (utatumia sifa zako za Twitter), alafu utaitumia kutafuta matokeo ya utafutaji, futa matokeo au matokeo ya kufuatilia. ```python from twarc import Twarc t = Twarc(consumer_key, consumer_secret, access_token, access_token_secret) for tweet in t.search("ferguson"): print(tweet["text"]) ``` Unaweza kufanya hivyo kwa mkondo wa machujio ya tweets ambazo zinafanana na kufuatilio neno muhimu: ```python for tweet in t.filter(track="ferguson"): print(tweet["text"]) ``` au mahali ```python for tweet in t.filter(locations="-74,40,-73,41"): print(tweet["text"]) ``` au ids za watumiaji ```python for tweet in t.filter(follow='12345,678910'): print(tweet["text"]) ``` Vivyo hivyo unaweza ku hydrate tweet identifiers kwa kupitisha orodha ya ids au jenereta: ```python for tweet in t.hydrate(open('ids.txt')): print(tweet["text"]) ``` ## Vya Kutumia Katika saraka `utils` kuna commands zinazo weza kukusaidia kufanya kazi na line-oriented JSON kama kuchapisha ma tweets kwa text au html, kuchimba majina za watumiaji, URLS. If tengeneza script yako tafadhali tushirikiana na PR. 
Unapopata tweets unaweza kuunda ukuta mzuri wako: % utils/wall.py tweets.jsonl > tweets.html Unaweza kuunda wingu ya maneno ya tweets ulizo sanya ambayo in neno nasa % utils/wordcloud.py tweets.jsonl > wordcloud.html Ikiwa umekusanya tweets kwa kutumia `replies` unaweza kuunda taswira ya D3 na: % utils/network.py tweets.jsonl tweets.html Unaweza kuimarisha tweets za mtumiaji, kukuruhusu kuona akaunti kuu: % utils/network.py --users tweets.jsonl tweets.html Na kama unataka kutumia grafu ya mtandao katika mpango kama [Gephi](https://gephi.org/), unaweza kuunda faili ya GEXF na % utils/network.py --users tweets.jsonl tweets.gexf `gender.py` ni chujio kinachokuwezesha kufuta tweets kulingana na nadhani kuhusu jinsia ya mwandishi. Kwa mfano unaweza kufuta tweets zote ambazo kuangalia kama walikuwa kutoka kwa wanawake, na kuunda wingu neno na: % utils/gender.py --gender female tweets.jsonl | utils/wordcloud.py > tweets-female.html Unaweza kutoa [GeoJSON](http://geojson.org/) ya tweets kama geo coordinates ziko: % utils/geojson.py tweets.jsonl > tweets.geojson Unaweza pia kutoa GeoJSON na centroids, kubadilisha nafasi ya masanduku: % utils/geojson.py tweets.jsonl --centroid > tweets.geojson Na ukitoa GeoJSON na centroids, unaweza kuongeza random fuzzing: % utils/geojson.py tweets.jsonl --centroid --fuzz 0.01 > tweets.geojson Ili kufuta tweets kwa kuwepo au kutokuwepo kwa kuratibu za geo (au Mahali, angalia nyaraka za [API](https://dev.twitter.com/overview/api/places)): % utils/geofilter.py tweets.jsonl --yes-coordinates > tweets-with-geocoords.jsonl % cat tweets.jsonl | utils/geofilter.py --no-place > tweets-with-no-place.jsonl Ili kufuta tweets na uzio wa GeoJSON (inahitaji [Shapely](https://github.com/Toblerity/Shapely)): % utils/geofilter.py tweets.jsonl --fence limits.geojson > fenced-tweets.jsonl % cat tweets.jsonl | utils/geofilter.py --fence limits.geojson > fenced-tweets.jsonl Ikiwa unadhani una duplicate kwenye tweets zako unaweza kuwapunguza: % utils/deduplicate.py 
tweets.jsonl > deduped.jsonl Unaweza kuchagua na ID, ambayo ni sawa na kutatua kwa wakati: % utils/sort_by_id.py tweets.jsonl > sorted.jsonl Unaweza kufuta tweets zote kabla ya tarehe fulani (kwa mfano, kama hashtag ilitumiwa kwa tukio lingine kabla ya moja unayopenda): % utils/filter_date.py --mindate 1-may-2014 tweets.jsonl > filtered.jsonl Unaweza kupata orodha ya HTML ya wateja kutumika: % utils/source.py tweets.jsonl > sources.html Ikiwa unataka kuondoa retweets: % utils/noretweets.py tweets.jsonl > tweets_noretweets.jsonl Au unshorten urls (requires [unshrtn](https://github.com/docnow/unshrtn)): % cat tweets.jsonl | utils/unshrtn.py > unshortened.jsonl Mara baada ya kufuta URL zako unaweza kupata orodha ya vya URL inayo tweets nyingi zaidi: % cat unshortened.jsonl | utils/urls.py | sort | uniq -c | sort -nr > urls.txt ## twarc-report Baadhi ya scripts zaidi ya huduma ili kuzalisha csv au json pato yanafaa kwa kutumia na [D3.js](http://d3js.org/) visualizations hupatikana katika [twarc-report](https://github.com/pbinkley/twarc-report). `directed.py` ilikuwa sehemu ya twarc imehama kwa twarc-report kama `d3graph.py`. Kila script pia inaweza kuzalisha demo html ya taswira ya D3, kwa mfano. [timelines](https://wallandbinkley.com/twarc/bill10/) or a [directed graph of retweets](https://wallandbinkley.com/twarc/bill10/directed-retweets.html). 
[Kihispania]: https://github.com/DocNow/twarc/blob/main/README_es_mx.md [Kiingereza]: https://github.com/DocNow/twarc/blob/main/README.md [Kijapani]: https://github.com/DocNow/twarc/blob/main/README_ja_jp.md [Kireno]: https://github.com/DocNow/twarc/blob/main/README_pt_br.md [Kisweden]: https://github.com/DocNow/twarc/blob/main/README_sv_se.md ================================================ FILE: docs/twarc1_zw_zh.md ================================================ twarc1 ===== twarc 是一个用来处理并存档推特 JSON 数据的命令行工具和 Python 包。 [正如](https://dev.twitter.com/overview/api/tweets)推特 API 返回的一样,twarc 处理的每一条推文都用一个 JSON 对象来表示。twarc 会自动处理推特 API 的[流量限制](https://dev.twitter.com/rest/public/rate-limiting)。除了可以让你收集推文之外,twarc 还可以帮助你收集用户信息、当下流行的标签和根据 id 获得推文的详细信息。 twarc 是作为 [Mellon Foundation](https://mellon.org/) 资助下的 [Documenting the Now](http://www.docnow.io) 项目的一部分开发的。 ## 安装 在使用 twarc 之前,你需要在 [apps.twitter.com](http://apps.twitter.com) 注册一个应用。一旦你注册了你的应用,记下你的 `consumer key` 和 `consumer secret` 并点击生成一组 `access token` 和 `access token secret`. 这四个数据在手你就可以开始使用 twarc 了。 1. 安装 [Python](http://python.org/download) (2 或者 3) 2. [pip](https://pip.pypa.io/en/stable/installing/) install twarc ### 使用Homebrew (仅限macOS 系统) macOS系统用户, 你可以通过Homebrew安装 `twarc` : ```shell $ brew install twarc ``` ## 快速开始: 首先你需要告诉 twarc 你的应用 keys 并授权它访问一个或者多个推特账号: ```shell twarc configure ``` 然后尝试搜索 ```shell twarc search blacklivesmatter > search.jsonl ``` 或者你想试试实时搜索? ```shell twarc filter blacklivesmatter > stream.jsonl ``` 请阅读下文了解更多这些命令的意义和更多内容。 ## 使用 ### 配置 在获得应用 keys 之后你可以通过 `configure` 命令来告诉 twarc 它们的值。 ```shell twarc configure ``` 这样做会在你的 `~` 目录下创建一个名为 `.twarc` 的文件来储存你的这些凭证,这样你就不必每次使用 twarc 的时候输入它们。如果你倾向于每次使用 twarc 的时候输入 keys,你可以使用环境变量 (`CONSUMER_KEY`, `CONSUMER_SECRET`, `ACCESS_TOKEN`, `ACCESS_TOKEN_SECRET`) 或者使用命令行工具选项 (`--consumer_key`, `--consumer_secret`, `--access_token`, `--access_token_secret`). 
### 搜索 搜索功能使用推特的[搜索推文](https://dev.twitter.com/rest/reference/get/search/tweets) API endpoint 来下载*已经存在*的符合搜索字符串的推文。 ```shell twarc search blacklivesmatter > tweets.jsonl ``` 尤其需要注意的是 `search` 返回的是过去七天内的推文:这是推特搜索 API 的限制。如果你觉得这太短了——我们也觉得——你或许会更愿意尝试使用下文提到的 `filter` 和 `sample` 命令。 最好的快速上手推特搜索语法的方法是实验[推特高级搜索](https://twitter.com/search-advanced)这个页面上的样例。你可以复制粘贴搜索框里的查询语句。比如这里有一个比较复杂的查询语句,它搜索包含有 `#blacklivesmatter` 和 `#blm` 关键字并发给 [deray](https://twitter.com/deray) 的推文。 ```shell twarc search '#blacklivesmatter OR #blm to:deray' > tweets.jsonl ``` 你还应当看一看 Igor Brigadir 关于推特高级搜索语法`精彩绝伦`的指南: [推特高级搜索 (英文)](https://github.com/igorbrigadir/twitter-advanced-search/blob/master/README.md). 这份指南里包含了很多阅读推特搜索文档后依然不显然的玄妙之处。 推特尝试显式地定义推文的语言。你可以尝试使用 [ISO 639-1] 规范限制你获得的推文的语言。 ```shell twarc search '#blacklivesmatter' --lang fr > tweets.jsonl ``` 你还可以通过位置来搜索。比如你可以搜索包含 `#blacklivesmatter` 且位置定位在密苏里弗格森半径1英里之内的推文。 ```shell twarc search blacklivesmatter --geocode 38.7442,-90.3054,1mi > tweets.jsonl ``` 如果一个包含 `--geocode` 的搜索没有包含要查询的字符串,那么你将得到所有与该位置和其半径相关的推文。 ```shell twarc search --geocode 38.7442,-90.3054,1mi > tweets.jsonl ``` ### 过滤 `filter` 命令使用推特的 [状态/过滤](https://dev.twitter.com/streaming/reference/post/statuses/filter) API 来搜集实时推文。 ```shell twarc filter blacklivesmatter,blm > tweets.jsonl ``` 请注意推特的 `track` 查询语句的语法和搜索 API 里的语法略有不同。请使用官方文档来了解如何最好地表达你的过滤命令选项。 使用 `follow` 命令行参数和用户的 id 来实时收集某个具体用户的推文。注意这个命令的结果包含转推。举个例子,下面的命令搜索 `CNN` 的推文和转推。 ```shell twarc filter --follow 759251 > tweets.jsonl ``` 你还可以限制一个地理上的矩形边界来收集推文。注意经纬度数据中的短横线必须用`\`转义,否则它将被理解成一个命令行参数! 
```shell twarc filter --locations "\-74,40,-73,41" > tweets.jsonl ``` 你可以使用 `lang` 命令行参数来传入 [ISO 639-1] 语言代码来限制语言。你还可以多次使用这个参数指定多种语言。下面的例子实时收集提到了巴黎和马德里的法语推文和西班牙语推文: ```shell twarc filter paris,madrid --lang fr --lang es ``` `filter` 和 `follow` 命令是**或**关系。下面的例子将收集包含 `blacklivesmatter` 或者 `blm` 关键字的推文,或者是来自 CNN 的推文。 ```shell twarc filter blacklivesmatter,blm --follow 759251 > tweets.jsonl ``` 但是将位置和语言限制合并将得到**和**的关系,下面的例子收集来自纽约且被标记为法语或者西班牙语的推文。 ```shell twarc filter --locations "\-74,40,-73,41" --lang es --lang fr ``` ### 采样 使用 `sample` 命令来监听推特的 [状态/采样](https://dev.twitter.com/streaming/reference/get/statuses/sample) API 来“随机“采样最近的、公开的推文。 ```shell twarc sample > tweets.jsonl ``` ### `脱水` 所谓的脱水 `dehydrate` 命令读取一个推文的 jsonl 文件,生成一个包含推文 id 的列表。 ```shell twarc dehydrate tweets.jsonl > tweet-ids.txt ``` ### `补水` twarc 所谓的补水命令 `hydrate` 是 `dehydrate` 的反过程,它读取一个包含推文 id 的文件,使用推特的 [状态/检索](https://dev.twitter.com/rest/reference/get/statuses/lookup) API 重建包含完整推文 json 的 jsonl 文件。 ```shell twarc hydrate ids.txt > tweets.jsonl ``` 推特 API 的[服务条款](https://dev.twitter.com/overview/terms/policy#6._Be_a_Good_Partner_to_Twitter) 反对用户将大量原始推文数据公布在网络上。数据可以被用来研究使用和保存在本地,但是不可以和世界分享。不过,推特确实允许用户大量地将推文 id 公开分享,而这些 id 可以用来重建推文 JSON 数据——通过 `hydrate` 命令和推特的 API. 这一点对于社交媒体研究中的[复现](https://en.wikipedia.org/wiki/Reproducibility)尤为重要。 ### 用户 用户 `users` 命令可以返回(多个)用户的元数据。用户的名称由推特上的屏幕名称唯一确认。(译者注:屏幕名称即你 @ 某用户时所显示的字符串)。 ```shell twarc users deray,Nettaaaaaaaa > users.jsonl ``` 你也可以使用用户的 id. ```shell twarc users 1232134,1413213 > users.jsonl ``` 你也可以使用一个包含用户 id 的文件作为输入,这在你同时使用 `followers` 和 `friends` 命令时尤其有用。举例如下: ```shell twarc users ids.txt > users.jsonl ``` ### 粉丝 粉丝 `followers` 命令使用推特的 [粉丝 id](https://dev.twitter.com/rest/reference/get/followers/ids) API 来收集推特用户粉丝的 id 信息。该命令的输入只能是一个用户的屏幕名称。举例如下: ```shell twarc followers deray > follower_ids.txt ``` 输出的结果每一行是一个粉丝用户 id. 
最新的粉丝将出现在最前面,依时间顺序倒序排列。 ### 朋友 和粉丝 `followers` 命令类似,朋友 `friends` 命令将使用推特的 [朋友 id](https://dev.twitter.com/rest/reference/get/friends/ids) API 收集推特用户朋友的 id 信息。该命令的输入只能是一个用户的屏幕名称。举例如下: ```shell twarc friends deray > friend_ids.txt ``` ### 当下流行 当下流行 `trends` 命令可以用来搜索当下流行的标签。你需要一个 [地球上哪里](https://web.archive.org/web/20180102203025/https://developer.yahoo.com/geo/geoplanet/) 的 id (woeid) 来指明你对哪个地理位置的当下流行标签感兴趣。下面这个例子中的 `2486982` 代表圣路易斯: ```shell twarc trends 2486982 ``` 令 `woeid` 为 1 即为搜索全球范围内当下流行的标签: ```shell twarc trends 1 ``` 如果你不确定 `woeid`, 可以留空,这样推特会返回一个列表,包括全球各地的当下流行标签。 ```shell twarc trends ``` 如果你已经知道确切的地理信息,可以用它来替代 `woeid`. ```shell twarc trends 39.9062,-79.4679 ``` 这里的原理是 twarc 将使用推特的[趋势/最近位置](https://dev.twitter.com/rest/reference/get/trends/closest) API 找到距离指定地点最近的 `woeid`. ### 时间线 时间线 `timeline` 命令将通过推特的[时间线](https://dev.twitter.com/rest/reference/get/statuses/user_timeline) API 收集某个用户最近的推文。用户名称由其屏幕名称指定。 ```shell twarc timeline deray > tweets.jsonl ``` 你也可以使用用户 id. ```shell twarc timeline 12345 > tweets.jsonl ``` ### 转推 你可以使用下面这个例子的格式来获得 id 为 `824077910927691778` 这条推文的转推。 ```shell twarc retweets 824077910927691778 > retweets.jsonl ``` 输入也可以是一个包含推文 id 的文本。 ```shell twarc retweets ids.txt > retweets.jsonl ``` ### 回复 推特的 API 不支持获得回复,但是 twarc 可以通过搜索 API 来近似模拟这一功能。因为搜索 API 的搜索时间区间只有过去一周所以 twarc 只能得到某条推文过去一周的回复。 下面这个例子使用推文 id 作为输入。 ```shell twarc replies 824077910927691778 > replies.jsonl ``` 使用 `--recursive` 选项可以获得回复的回复以及引用。注意这可能会花费很长时间因为推特的搜索 API 有流量限制。 ```shell twarc replies 824077910927691778 --recursive ``` ### 列表 你可以将推特用户列表的 URL 传入 `listmembers` 命令得到列表中的用户: ```shell twarc listmembers https://twitter.com/edsu/lists/bots ``` ## 付费搜索 API 推特引入了付费搜索 API. 它可以让你通过付款的方式实现更高级的搜索功能。你需要在[仪表板](https://developer.twitter.com/en/dashboard) 配置一个环境。在此之后,你可以搜索不限于最近7天内的推文的过去30天内的备份甚至完整推文备份。如果需要在命令行实现这一功能,你需要告诉 twarc 你在使用哪一个 endpoint 和环境。 为了控制预算,你可能需要限制搜索的时间段:使用 `--to_date` 和 `--from_date`. 
除此之外,你还可以使用 `--limit` 参数来限制返回的推文数目上限。 举例来看,假设今天是2020年6月1日,如果你想搜索不超过1000条从2020年5月1日到2020年5月14日所有提到 `blacklivesmatter` 的推文。如果我们的环境名为 `docnowdev`, 那么这个命令如下,注意我们使用了 `--30day` 这个 endpoint: ```shell twarc search blacklivesmatter \ --30day docnowdev \ --from_date 2020-05-01 \ --to_date 2020-05-14 \ --limit 1000 \ > tweets.jsonl ``` 类似的,如果你要搜索超过30天期限的全部推文备份,你需要使用 fullarchive, 举例如下: ```shell twarc search blacklivesmatter \ --fullarchive docnowdev \ --from_date 2014-08-04 \ --to_date 2014-08-05 \ --limit 1000 \ > tweets.jsonl ``` 如果你的环境在沙盒之中,你需要使用 `--sandbox` 参数来告诉 twarc 不要获得超过100条推文。默认的非沙盒环境的上限是500条。 ```shell twarc search blacklivesmatter \ --fullarchive docnowdev \ --from_date 2014-08-04 \ --to_date 2014-08-05 \ --limit 1000 \ --sandbox \ > tweets.jsonl ``` ## Gnip 企业级 API twarc 支持和 Gnip 推特全备份企业级 API 的完全整合。你需要使用 `--gnip_auth` 参数并设置好 `GNIP_USERNAME`、 `GNIP_PASSWORD`、 `GNIP_ACCOUNT` 三个环境变量。举例如下: ```shell twarc search blacklivesmatter \ --gnip_auth \ --gnip_fullarchive prod \ --from_date 2014-08-04 \ --to_date 2015-08-05 \ --limit 1000 \ > tweets.jsonl ``` ## 作为一个 Python 包的 twarc 如果你想在你自己的代码里使用 twarc 的话,你需要首先创建一个 `twarc` 实例,传入你的推特应用凭证然后用它进行搜索、过滤和检索。 举例如下: ```python from twarc import Twarc t = Twarc(consumer_key, consumer_secret, access_token, access_token_secret) for tweet in t.search("ferguson"): print(tweet["text"]) ``` 你还可以用同样的语法过滤满足关键字匹配的实时信息流。举例如下: ```python for tweet in t.filter(track="ferguson"): print(tweet["text"]) ``` 或者地点: ```python for tweet in t.filter(locations="-74,40,-73,41"): print(tweet["text"]) ``` 或者用户 id: ```python for tweet in t.filter(follow='12345,678910'): print(tweet["text"]) ``` 类似的,你还可以传入一个包含推特 id 的文件,“补水”以获得完整信息。举例如下: ```python for tweet in t.hydrate(open('ids.txt')): print(tweet["text"]) ``` ## 基于用户的验证和基于应用的验证 twarc 自动处理推特的流量限制。但是你应该了解流量限制会因为验证方式的不同而不同。推特有两种验证方式分别是基于用户的验证和基于应用的验证。 twarc 默认使用基于用户的验证方式但是你可以告诉 twarc 使用基于应用的验证。 举个例子,转为基于应用的验证可以显著提高搜索功能的效率。基于用户的验证每分钟可以发出180个请求(每天160万条结果),而基于应用的验证每分钟可以发出450个请求(每天430万个结果)。 需要注意的是,用 “补水”功能访问 `状态/检索 
(status/lookup)` 这个 API endpoint 在基于用户的验证下有每15分钟900个请求的限制,而在基于应用的验证下是每15分钟300个。 如果你确认你要使用基于应用的验证,你可以使用 `--app_auth` 这个命令行选项。举例如下: ```shell twarc --app_auth search ferguson > tweets.jsonl ``` 类似的功能也可以在你的 Python 代码中实现。 ```python from twarc import Twarc t = Twarc(app_auth=True) for tweet in t.search('ferguson'): print(tweet['id_str']) ``` ## 实用工具 在 `utils` 文件夹下你可以找到几个脚本。这些脚本可以作用于 jsonl 文件上实现一些非常实用的功能:比如将 JSON 格式的推文输出为文本或者 HTML 格式, 提取用户名或者推文中引用的 URL 等等。如果你创作了一个好用的脚本,欢迎提出 PR. 下面的命令可以创作一个简单的推文墙。 ```shell utils/wall.py tweets.jsonl > tweets.html ``` 下面的命令可以创作一个简单的词云。 ```shell utils/wordcloud.py tweets.jsonl > wordcloud.html ``` 如果你用 `replies` 命令收集了一些推文,你可以用下面的命令创作一个静态的 D3 可视化。 ```shell utils/network.py tweets.jsonl tweets.html ``` 你可以增加可选参数根据用户组织推文,这样你可看到这个网络中的核心账号。 ```shell utils/network.py --users tweets.jsonl tweets.html ``` 额外的,你可以创作一个标签的网络,从而看到它们彼此之间的(共存)关系。 ```shell utils/network.py --hashtags tweets.jsonl tweets.html ``` 如果你想使用网络作图软件 [Gephi](https://gephi.org/),你可以用下面的命令生成一个 `GEXF` 格式的文件。 ```shell utils/network.py --users tweets.jsonl tweets.gexf utils/network.py --hashtags tweets.jsonl tweets.gexf ``` 额外的,如果你想将网络转换成一个随时间线动态变化(节点会出现和消失)的动态网络,你可以在 Gephi 中打开生成的 `GEXF` 文件,跟随这个[教程](https://seinecle.github.io/gephi-tutorials/generated-html/converting-a-network-with-dates-into-dynamic.html)实现。注意在 `tweets.gexf` 文件里,仅有 `start_date` 一栏但是却没有 `end_date` 一栏,这会导致节点出现在屏幕上后便不再消失。对于 Gephi 中的 `Time interval creation options` 跳出窗口,`Start time column` 应该是 `start_date`, 而 `End time column` 则是空白的。`Parse dates` 应该勾选,同时选择最后一个日期格式选项:`dd/MM/yyyy HH:mm:ss`, 如下图所示。 `gender.py` 是一个可以猜测推文作者性别的脚本。比如下面的例子展示了如何保留看上去像是女性发出的推文并生成一个词云。 ```shell utils/gender.py --gender female tweets.jsonl | utils/wordcloud.py > tweets-female.html ``` 你可以用含有地理定位信息的推文生成 [GeoJSON](http://geojson.org/) 格式的文件。 ```shell utils/geojson.py tweets.jsonl > tweets.geojson ``` 你还可以用地理边界的[形心](https://en.wikipedia.org/wiki/Centroid)来取代地理位置矩形的边界。 ```shell utils/geojson.py tweets.jsonl --centroid > tweets.geojson ``` 
在此基础上你还可以加一些随机模糊。 ```shell utils/geojson.py tweets.jsonl --centroid --fuzz 0.01 > tweets.geojson ``` 欲了解更多关于利用地理坐标(或地点)的存在与否过滤推文的内容,请参考[文档](https://dev.twitter.com/overview/api/places)。下面是两个例子。 ```shell utils/geofilter.py tweets.jsonl --yes-coordinates > tweets-with-geocoords.jsonl cat tweets.jsonl | utils/geofilter.py --no-place > tweets-with-no-place.jsonl ``` 欲通过 GeoJson 的边界过滤推文,请参考下面的例子。注意你需要安装 [Shapely](https://github.com/Toblerity/Shapely). ```shell utils/geofilter.py tweets.jsonl --fence limits.geojson > fenced-tweets.jsonl cat tweets.jsonl | utils/geofilter.py --fence limits.geojson > fenced-tweets.jsonl ``` 如果你怀疑你有重复的推文,可以用下面的命令去重。 ```shell utils/deduplicate.py tweets.jsonl > deduped.jsonl ``` 你可以用下面的命令像根据时间线排序一样根据推文 id 排序。 ```shell utils/sort_by_id.py tweets.jsonl > sorted.jsonl ``` You can filter out all tweets before a certain date (for example, if a hashtag was used for another event before the one you're interested in): 你可以过滤掉某一具体日期前的推文,举个例子,有可能这一日期前某个标签的含义并不是你感兴趣的意思。 ```shell utils/filter_date.py --mindate 1-may-2014 tweets.jsonl > filtered.jsonl ``` 你还能够以列表的形式得到客户端信息。 ```shell utils/source.py tweets.jsonl > sources.html ``` 下面的命令去除了转推。 ```shell utils/noretweets.py tweets.jsonl > tweets_noretweets.jsonl ``` 或者复原原始的 URL 的长度(需要安装[unshrtn](https://github.com/docnow/unshrtn))。 ```shell cat tweets.jsonl | utils/unshrtn.py > unshortened.jsonl ``` 一旦你获得了原始的 URL, 你可以根据推文中提到的次数对这些 URL 排序。 ```shell cat unshortened.jsonl | utils/urls.py | sort | uniq -c | sort -nr > urls.txt ``` ## twarc-report 项目 还有一些可以生成 csv 或者 json 输出以供 [D3.js](http://d3js.org/) 可视化使用的脚本可以在 [twarc-report](https://github.com/pbinkley/twarc-report) 项目中找到。原本属于 twarc 一部分的 `directed.py` 脚本也已经被转移到了 twarc-report 项目并被重命名为 `d3graph.py`. 下面的这两个链接包含了两个生成 HTML 格式的 D3 可视化文件的例子。 1. [timelines](https://wallandbinkley.com/twarc/bill10/) 2. 
[directed graph of retweets](https://wallandbinkley.com/twarc/bill10/directed-retweets.html) [英语]: https://github.com/DocNow/twarc/blob/main/README.md [日语]: https://github.com/DocNow/twarc/blob/main/README_ja_jp.md [葡萄牙语]: https://github.com/DocNow/twarc/blob/main/README_pt_br.md [西班牙语]: https://github.com/DocNow/twarc/blob/main/README_es_mx.md [瑞典语]: https://github.com/DocNow/twarc/blob/main/README_sv_se.md [斯瓦希里语]: https://github.com/DocNow/twarc/blob/main/README_sw_ke.md [ISO 639-1]: https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes ================================================ FILE: docs/twarc2_en_us.md ================================================ # twarc2 twarc2 is a command line tool and Python library for archiving Twitter JSON data. Each tweet is represented as a JSON object that was returned from the Twitter API. Since Twitter's introduction of their [v2 API](https://developer.twitter.com/en/docs/twitter-api/api-reference-index#v2) the JSON representation of a tweet is conditional on the types of fields and expansions that are requested. twarc2 does the work of requesting the highest fidelity representation of a tweet by requesting all the available data for tweets. Tweets are streamed or stored as [line-oriented JSON](https://en.wikipedia.org/wiki/JSON_Streaming#Line-delimited_JSON). twarc2 will handle Twitter API's [rate limits](https://dev.twitter.com/rest/public/rate-limiting) for you. In addition to letting you collect tweets twarc can also help you collect users and hydrate tweet ids. It also has a collection of [plugins](plugins) you can use to do things with the collected JSON data (such as converting it to CSV). twarc2 was developed as part of the [Documenting the Now](http://www.docnow.io) project which was funded by the [Mellon Foundation](https://mellon.org/). 
## Install Before using twarc you will need to create an application and attach it to a project on your [Twitter Developer Portal](https://developer.twitter.com/en/portal/projects-and-apps). A ["Project"](https://developer.twitter.com/en/docs/projects/overview) is like a container for an "Application" with a specific purpose. If you have Academic Access you should see an "Academic Research" Project, if not, you should see only "Standard" Project. Academic Access is a separate endpoint, see [here](twitter-developer-access.md) for notes on this. Once you've created your application, note down the Bearer token, and/or the consumer key, consumer secret, which may also be called API Key and API Secret and then optionally click to generate an access token and access token secret. With these four variables in hand you are ready to start using twarc. 1. install [Python 3](http://python.org/download) 2. [pip](https://pip.pypa.io/en/stable/installing/) install twarc from a terminal (such as the Windows Command Prompt available in the "start" menu, or the [OSX Terminal application](https://support.apple.com/en-au/guide/terminal/apd5265185d-f365-44cb-8b09-71a064a42125/mac)): ``` pip install --upgrade twarc ``` ### Homebrew (macOS only) For macOS users, you can also install `twarc` via [Homebrew](https://brew.sh/): ```bash brew install twarc ``` ### Windows If you installed with pip and see a "failed to create process" when running twarc try reinstalling like this: python -m pip install --upgrade --force-reinstall twarc ## Quickstart: First you're going to need to tell twarc about your application API keys and grant access to one or more Twitter accounts: twarc2 configure Then try out a search: twarc2 search "blacklivesmatter" results.jsonl Or maybe you'd like to collect tweets as they happen? twarc2 filter "blacklivesmatter" results.jsonl See below for the details about these commands and more. 
## Configure Once you've got your Twitter developer access set up you can tell twarc what your credentials are with the `configure` command. twarc2 configure This will store your credentials in your home directory so you don't have to keep entering them in. You can use most of twarc's functionality by simply configuring the *bearer token*, but if you want it to be complete you can enter in the *API key* and *API secret*. You can also set the keys in the system environment (`CONSUMER_KEY`, `CONSUMER_SECRET`, `ACCESS_TOKEN`, `ACCESS_TOKEN_SECRET`) or using command line options (`--consumer-key`, `--consumer-secret`, `--access-token`, `--access-token-secret`). ## Search This uses Twitter's [tweets/search/recent](https://developer.twitter.com/en/docs/twitter-api/tweets/search/api-reference/get-tweets-search-recent) and [tweets/search/all](https://developer.twitter.com/en/docs/twitter-api/tweets/search/api-reference/get-tweets-search-all) endpoints to download *pre-existing* tweets matching a given query. This command will search for any tweets mentioning *blacklivesmatter* from the last 7 days. twarc2 search "blacklivesmatter" results.jsonl If you have access to the [Academic Research Product Track](https://developer.twitter.com/en/products/twitter-api/academic-research) you can search the full archive of tweets by using the `--archive` option. twarc2 search --archive "blacklivesmatter" results.jsonl The queries can be a lot more expressive than matching a single term. For example this query will search for tweets containing either `blacklivesmatter` or `blm` that were sent to the user \@deray. twarc2 search "(blacklivesmatter OR blm) to:deray" results.jsonl The best way to get familiar with Twitter's search syntax is to consult Twitter's [Building queries for Search Tweets](https://developer.twitter.com/en/docs/twitter-api/tweets/search/integrate/build-a-query) documentation. 
You also should definitely check out Igor Brigadir's *excellent* reference guide to the Twitter Search syntax: [Advanced Search on Twitter](https://github.com/igorbrigadir/twitter-advanced-search/blob/master/README.md). There are lots of hidden gems in there that the advanced search form doesn't make readily apparent. ### Limit Because there is a 500,000 tweet limit (5, or sometimes 10 million for Academic Research Track) you may want to limit the number of tweets you retrieve by using `--limit`: twarc2 search --limit 5000 "blacklivesmatter" results.jsonl ### Time You can also limit to a particular time range using `--start-time` and/or `--end-time`, which can be especially useful in conjunction with `--archive` when you are searching for historical tweets. twarc2 search --start-time 2014-07-17 --end-time 2014-07-24 '"eric garner"' tweets.jsonl If you leave off --start-time or --end-time it will be open on that side. So for example to get all "eric garner" tweets before 2014-07-24 you would just leave off the `--start-time`: twarc2 search --end-time 2014-07-24 '"eric garner"' tweets.jsonl ### Sort Order By default, Twitter returns the results ordered by their published date with the newest tweets being first. To alter this behavior, it is possible to specify the `--sort-order` parameter. Currently, it supports `recency` (the default) or `relevancy`. In the latter case, tweets are ordered based on what Twitter determines to be the best results for your query. ## Searches Searches works like the [search](#search) command, but instead of taking a single query, it reads from a file containing many queries. You can use the same limit and time options just like a single search command, but it will be applied to every query. 
The input file for this command needs to be a plain text file, with one line for each query you want to run, for example you might have a file called `animals.txt` with the following lines: cat dog mouse OR mice Note that each line will be passed through directly to the Twitter API - if you have quoted strings, they will be treated as a phrase search by the Twitter API, which might not be what you intended. If you run the following `searches` command, `animals.json` will contain at least 100 tweets for each query in the input file: twarc2 searches --limit 100 animals.txt animals.json You can use the `--archive` and `--start-time` flags just like a regular search command too, in this case to search the full archive of all tweets for the first day of 2020: twarc2 searches --archive --start-time 2020-01-01 --end-time 2020-01-02 animals.txt animals.json You can also use the `--counts-only` flag to check volumes first. This produces a csv file in the same format as the [counts](#counts) command with the `--csv` flag, with the addition of a column containing the query for that row. twarc2 searches --counts-only animals.txt animals_counts.csv One more thing - if you have a lot of searches you want to run, you might want to consider using the `--combine-queries` flag. This combines consecutive queries in the file into a single longer query, meaning you issue fewer API calls and potentially collect fewer duplicate tweets that match more than one query. Using this on the `animals.txt` file as input will combine the three queries into the single longer query `(cat) OR (dog) OR (mouse OR mice)`, and only issue one logical query. twarc2 searches --combine-queries animals.txt animals_combined.json ## Stream The `stream` command will use Twitter's API [tweets/search/stream](https://developer.twitter.com/en/docs/twitter-api/tweets/filtered-stream/api-reference/get-tweets-search-stream) endpoint to collect tweets as they happen. 
In order to use it you first need to create one or more [rules]. For example: twarc2 stream-rules add blacklivesmatter You can list your active stream rules: twarc2 stream-rules list And you can collect the data from the stream, which will bring down any tweets that match your rules: twarc2 stream stream.jsonl When you want to stop you use `ctrl-c`. This only stops the stream but doesn't delete your stream rule. To remove a rule you can: twarc2 stream-rules delete blacklivesmatter ## Sample Use the `sample` command to listen to Twitter's [tweets/sample/stream](https://developer.twitter.com/en/docs/twitter-api/tweets/sampled-stream/api-reference/get-tweets-sample-stream) API for a "random" sample of recent public statuses. The sampling is based on the millisecond part of the tweet timestamp. twarc2 sample sample.jsonl ## Users If you have a file of user ids you can fetch the user metadata for them with the `users` command: twarc2 users users.txt users.jsonl If the file contains usernames instead of user ids you can use the `--usernames` option: twarc2 users --usernames users.txt users.jsonl ## Followers You can fetch the followers of an account using the `followers` command: twarc2 followers deray users.jsonl ## Following To get the users that a user is following you can use `following`: twarc2 following deray users.jsonl The result will include exactly one user id per line. The response order is reverse chronological, or most recent followers first. ## Timeline The `timeline` command will use Twitter's [user timeline API](https://developer.twitter.com/en/docs/twitter-api/tweets/timelines/api-reference/get-users-id-tweets) to collect the most recent tweets posted by the user indicated by screen_name. 
twarc2 timeline deray tweets.jsonl ## Conversation You can retrieve a conversation thread using the tweet ID at the head of the conversation: twarc2 conversation 266031293945503744 > conversation.jsonl ## Likes Twarc supports the two approaches that the Twitter API exposes for collecting likes via the `liked-tweets` and `liking-users` commands. The `liked-tweets` command returns the tweets that have been liked by a specific account. The account is specified by the user ID of that account, in the following example is the account of Twitter's founder: twarc2 liked-tweets 12 jacks-likes.jsonl In this case the output file contains all of the likes of publicly accessible tweets. Note that the order of likes is not guaranteed by the API, but is probably reverse chronological, or most recent likes by that account first. The underlying tweet objects contain no information about when the tweet was liked. The `liking-users` command returns the user profiles of the accounts that have liked a specific tweet (specified by the ID of the tweet): twarc2 liking-users 1460417326130421765 liking-users.jsonl In this example the output file contains all of the user profiles of the publicly accessible accounts that have liked that specific tweet. Note that the order of profiles is not guaranteed by the API, but is probably reverse chronological, or the profile of the most recent like for that account first. The underlying profile objects contain no information about when the tweet was liked. Note that likes of tweets that are not publicly accessible, or likes by accounts that are protected will not be retrieved by either of these methods. Therefore, the metrics available on a tweet object (under the `public_metrics.like_count` field) will likely be higher than the number of likes you can retrieve via the Twitter API using these endpoints. 
## Retweets You can retrieve the user profiles of publicly accessible accounts that have retweeted a specific tweet, using the `retweeted-by` command and the ID of the tweet as an identifier. For example: twarc2 retweeted-by 1460417326130421765 retweeting-users.jsonl Unfortunately this only returns the user profiles (presumably in reverse chronological order) of the retweeters of that tweet - this means that important information, like when the tweet was retweeted is not present in the returned object. ## Dehydrate The `dehydrate` command generates an id list from a file of tweets: twarc2 dehydrate tweets.jsonl tweet-ids.txt ## Hydrate twarc's `hydrate` command will read a file of tweet identifiers and write out the tweet JSON for them using Twitter's [tweets](https://developer.twitter.com/en/docs/twitter-api/tweets/lookup/api-reference/get-tweets) API endpoint: twarc2 hydrate ids.txt tweets.jsonl The input file, `ids.txt` is expected to be a file that contains a tweet identifier on each line, without quotes or a header: ``` 919505987303886849 919505982882844672 919505982602039297 ``` Twitter API's [Terms of Service](https://dev.twitter.com/overview/terms/policy#6._Be_a_Good_Partner_to_Twitter) discourage people from making large amounts of raw Twitter data available on the Web. The data can be used for research and archived for local use, but not shared with the world. Twitter does allow files of tweet identifiers to be shared, which can be useful when you would like to make a dataset of tweets available. You can then use Twitter's API to *hydrate* the data, or to retrieve the full JSON for each identifier. This is particularly important for [verification](https://en.wikipedia.org/wiki/Reproducibility) of social media research. ## Places The search and stream APIs allow you to search by places. But in order to use them you need to know the identifier for a specific place. 
twarc's `places` command will let you search by the place name, geo coordinates, or ip address. For example: twarc2 places Ferguson Which will output something like: ```shell $ twarc2 places Ferguson Ferguson, MO, United States [id=0a62ce0f6aa37536] Ruisseau-Ferguson, Québec, Canada [id=25283a1f59449e8f] Ferguson, Victoria, Australia [id=2538e66b7e5c082c] Ferguson Road Initiative, Dallas, United States [id=368aad647311292a] Ferguson, Western Australia, Australia [id=45f20c78d803ad84] Ferguson, PA, United States [id=00c92e14361c9674] Ferguson, KY, United States [id=0190ea5612aaae32] ``` You can then use one of the ids in a search: twarc2 search "place:0a62ce0f6aa37536" tweets.jsonl You can also search by geo-coordinates (lat,lon) and IP address. If you would prefer to see the full JSON response with the bounding boxes use the `--json` option. ## Command Line Usage Below is what you see when you run `twarc2 --help`. ::: mkdocs-click: :module: twarc.command2 :command: twarc2 :depth: 1 ================================================ FILE: docs/twitter-developer-access.md ================================================ # Twitter Developer Access If you have established that you would like to use Twitter Data in your study, you will need access to the API. There are several steps required to get access to the API. This is a guide on how best to engage with this process. Allow plenty of time for this. Twitter has made the process of accessing their API more strict. There are a number of restricted use cases that may require you implement additional safeguards. Before applying, the Terms of Service for Developers and the [Restricted Use Cases](https://developer.twitter.com/en/developer-terms/more-on-restricted-use-cases) are very short and relevant to read. ## Step 0: Have a Twitter account in good standing Create and or edit your Twitter profile to fit your person or organization, preferably in English. 
Make sure it's public and you do the basic things like verifying your email and phone number (do not use a VoIP service), setting a non default profile picture and header, a description, links to your research group or website, a good description that identifies you as you, and preferably some friends and followers who are already on twitter in your research community. Use a good stable email provider (gmail) or your institution email as long as it is reliable and you can see any emails that may end up in spam, just in case. ## Step 1: Applying for a Developer Account Fill out the forms for a new Individual developer Account here: . Team accounts are not supported with Academic Access, so do not apply for a Team account. Pay attention to the specifics of each question: especially about sharing data outside of your organization, and with other government entities. Wait for a reply. This may take a couple of weeks. ## Step 2: Apply for the special Academic Access v2 Endpoint Even if you specify your use case as "Academic" use case in your developer application form, you will not automatically get access to the [new Search endpoint](https://developer.twitter.com/en/docs/twitter-api/tweets/search/api-reference/get-tweets-search-all) with higher limits for academic use. You must fill in an additional form: Twitter generally prefers to grant access to faculty and postgrad researchers, not undergrad or masters students or contractors or collaborators. It may be better for the principal investigator or professor to log in from an institution account or their own one, provided it is in good standing and has an obviously identifiable online academic presence. This application may also take a couple of days or weeks. ## Step 3: Create a Project and App A Project with Academic Access should be created for you, or if you did not get Academic Access, you can create a new Standard Project. On your Dashboard you should see "Academic Research" or "Standard" and "Standalone Apps". 
Before accessing the v2 API, you will need to create an App or use an existing one and add it to the Academic Access Project first. You can only have 1 App assigned to 1 Project. When Creating an app, take note of the keys you are given: API Key: ``` hCe77nsrgew3gsdhSDGFSgsdf ``` API Secret: ``` 1jWERGWBrtRTWBTwGFDHGFH66SDFGSDFGSSDFGSDFGSSDFGa11 ``` Bearer Token: ``` AAAAAAAAAAAAAAAAAAAAAAAsdfgsAAAAvSDFGSDRgssdfSDFGSDF44gsd4E%3Dkk33345336dfsgsdgsdgsdASGASDGadsGAFAKJGYIUYUIDGGKK ``` These are fake but have the same format as real ones. Note the `%` sign in the Bearer Token - this can often cause errors when copy pasting or providing this token in a command line. Other common causes of errors are including a trailing space, or extra `"` or `'` quotes or not quoting the string in code or command line. This depends on implementation. These are important to save and [store as you would a password](https://developer.twitter.com/en/docs/authentication/guides/authentication-best-practices). Continue to "App Settings" and fill in the description field of the app. You don't need to change any other settings here. Generally you will only need Read Only Access and will not need "3-legged OAuth" or callback URLs unless you plan on using the [Account Activity API](https://developer.twitter.com/en/docs/twitter-api/enterprise/account-activity-api/overview) if you want to make an interactive Bot for example. A project must *contain* an app. The difference between a [Project](https://developer.twitter.com/en/docs/projects/overview) and [App](https://developer.twitter.com/en/docs/apps/overview) is sometimes confusing. *Standalone Apps* are for `v1.1` endpoints, Standard and Academic Access *Projects* are for `v2` endpoints. ## Step 4: Collaborating with Others Now that you have your keys and tokens, you can start using the API. You may be working with other people on implementations, so you may have to share your keys with someone at some point. 
Do not share your Twitter user and password details for the Developer Dashboard. This is not a good idea. Currently Twitter's "Teams" functionality is also incompatible with Academic Access. The best way is to provide your collaborator with the keys in a plain text configuration file that you securely share. Or as Environment variables. When someone has your keys, they have full access to the API on your behalf. Be careful not to commit your keys into a public repository or make them visible to the public - do not include them in a client side js script for example. Most apps will ask for API Key and Secret, but "Consumer Key" is "API Key" and "Consumer Secret" is "API Secret". For Academic Access, there is only one endpoint that takes Bearer (App Only) authentication, so in most cases, the Bearer Token is all you need to share. ## Step 5: Next Steps Install `twarc`, and run `twarc2 configure` to set it up. To make arbitrary API calls for testing, [twurl](https://github.com/twitter/twurl) is a good tool, when combined with [jq](https://stedolan.github.io/jq/). To get help, a good place is the [Developer Forums](https://twittercommunity.com/), or the [DocNow Slack](https://docs.google.com/forms/d/1Wk0JdF2Cty2VHMqpf_QlJXVKQdUtfeeFhaYRben3qaM/viewform), or [Stackoverflow](https://stackoverflow.com/) for implementation details, or the repository [Issues](https://github.com/DocNow/twarc) if it's an issue with twarc or one of the addons. To share and publish a Twitter Dataset, extract the Tweet IDs and or User IDs, and format these as 1 ID per line in a plain text file (optionally, you can compress this file). This will make your dataset easier to process for others. See the [DocNow Catalog](https://catalog.docnow.io/) and tools like [Zenodo](https://zenodo.org/) and [Figshare](https://figshare.com/). 
================================================ FILE: docs/windows10.md ================================================ # twarc2 on Windows 10 This guide assumes you already have a Twitter Developer Account, a registered App with your keys and a Bearer Token, and Python installed on Windows. ## Prerequisites and Installation You must have Python installed and working on Windows. Python will be located in different places on your computer if you installed Python from either the [official website](https://www.python.org/downloads/windows/), or from the [Microsoft App store](https://www.microsoft.com/en-us/p/python-38/9mssztt1n39l), or via [Anaconda](https://www.anaconda.com/products/individual#windows). Check that you can run these successfully: Open the command line `cmd.exe` or `PowerShell` or `Windows Terminal Preview` and run: `python --version` and `pip --version` If both give you some version output without errors everything is ready to go. Otherwise, install and configure `python` and `pip`. `twarc2` CLI works best through [Windows Terminal Preview](https://www.microsoft.com/en-us/p/windows-terminal-preview/9n8g5rfz9xk3?activetab=pivot:overviewtab) ## Setting up twarc2 Install `twarc2` with `pip install --upgrade twarc` If you get a warning like ``` WARNING: The scripts twarc.exe and twarc2.exe are installed in 'C:\Users\t495\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\Scripts' which is not on PATH. Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location. ``` You will need to add that folder to the PATH. This will be different for your machine, so make sure to copy the full folder location from the command prompt, without the `'` quotes with `CTRL+C`. 
Make sure that folder is set in PATH System Variables: In Settings, find "edit the system environment variables" After clicking on "Environment Variables" Edit the "Path" variable in User Variables and add a new entry, in my case it was `C:\Users\t495\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\Scripts` but for you it will be different. Copy this from the warning it gives you, because it varies. You should now be able to run `twarc2` from the command line: `twarc2` If you can see the instructions, everything is ready to go. In powershell or command prompt, run: `twarc2 configure` Paste in your Bearer token, taking care not to accidentally copy an extra new line or space. It's not recommended to type these in manually, the API Secret entry will also not display what's being typed, but it still accepts input. If something went wrong, you can repeat the command and start over. The keys will be saved in a file that you can use Notepad to view, saved in `C:\Users\youraccount\AppData\Roaming\twarc\config` or sometimes a different location, twarc will output the location of this file after the command runs. When this is completed, twarc2 is ready to use. ## Escaping `"` Characters in Windows The query you specify to search can contain `"` quotes for phrases, spaces and other special characters like `:` and `()`. When entered directly into the prompt these can be interpreted as part of the command, not part of the command line argument value. Windows has an odd way of escaping characters in the command line. To use a `"` in a query, change it to `""` in Windows. The more common escape `\"` does not work. 
For example, if you want to search for tweets that contain the phrase `"live laugh love"` or `"home sweet home"` in english, from the US, the query would be: ``` lang:en ("live laugh love" OR "home sweet home") place_country:US ``` Changing the `"` to `""` The twarc2 command (`--limit` is optional) for this would be: ``` twarc2 search --limit 500 "lang:en (""live laugh love"" OR ""home sweet home"") place_country:US" output.json ``` This Stackoverflow answer has the long version that explains why this works: https://stackoverflow.com/a/15262019 ## Output Format Errors: If you see this kind of error, for example when using `twarc2 flatten`: > ⚡ Expecting value: line 1 column 1 (char 0) It means the file was incorrectly saved. There is an edge case in Windows when writing output, do not use `>` to redirect `stdout`. This alters how files are written, and adds a BOM (Byte Order Mark) that makes the files unreadable to twarc for later, eg: when using `twarc2 flatten`. To fix the file, edit it in a Hex editor to remove the first 2 bytes. For example, this will give you a bad file with a BOM: `twarc2 search --limit 100 "dogs" > dogs.json` While this will give you a correctly written UTF8 file: `twarc2 search --limit 100 "dogs" dogs.json` Do not redirect stdout to a file in Windows, instead - specify the output file as a command line argument. ================================================ FILE: mkdocs.yml ================================================ site_name: twarc site_url: https://readthedocs.org/projects/twarc-project/ site_description: Collect Twitter JSON data from the command line. 
repo_url: https://github.com/docnow/twarc repo_name: twarc edit_uri: edit/main/docs/ theme: name: "material" logo: images/docnow.png palette: scheme: preference nav: - Home: README.md - twarc2: - twarc2 (en): twarc2_en_us.md - twarc1: - twarc1 (en): twarc1_en_us.md - twarc1 (es): twarc1_es_mx.md - twarc1 (ja): twarc1_ja_jp.md - twarc1 (pt): twarc1_pt_br.md - twarc1 (sv): twarc1_sv_se.md - twarc1 (sw): twarc1_sw_ke.md - twarc1 (zw): twarc1_zw_zh.md - Plugins: plugins.md - Tutorial: tutorial.md - Resources: resources.md - Twitter Developer Access: twitter-developer-access.md - Windows 10: windows10.md - Library API: - api/client.md - api/client2.md - api/library.md - api/expansions.md plugins: - search - mkdocstrings markdown_extensions: - mkdocs-click - pymdownx.highlight - pymdownx.superfences ================================================ FILE: pyproject.toml ================================================ [project] name = "twarc" version = "2.14.1" description = "Archive tweets from the command line" license = "MIT" readme = "README.md" requires-python = ">=3.13" dependencies = [ "click>=7,<9", "click-config-file>=0.6", "click-plugins>=1", "humanize>=3.9", "python-dateutil>=2.8", "requests_oauthlib>=1.3", "tqdm>=4.62", "twarc-csv>=0.7.2", ] [dependency-groups] dev = [ "black>=25.9.0", "pytest>=8.4.2", "pytest-black>=0.6.0", "python-dotenv>=1.2.1", "pytz>=2025.2", "toml>=0.10.2", ] [project.scripts] twarc = "twarc.command:main" twarc2 = "twarc.command2:twarc2" [tool.pytest.ini_options] addopts = "--verbose --black" [tool.uv.workspace] members = [ "tmp/twarc", ] [build-system] requires = ["uv_build>=0.8.3,<0.9.0"] build-backend = "uv_build" ================================================ FILE: requirements-mkdocs.txt ================================================ click>=7,<9 click-config-file>=0.6 click-plugins>=1 humanize>=3.9 python-dateutil>=2.8 requests_oauthlib>=1.3 tqdm>=4.62 mkdocs>=1.2 mkdocs-click>=0.4 mkdocs-material>=7.2 mkdocstrings[python]>=0.15 
================================================ FILE: setup.cfg ================================================ [tool:pytest] addopts=--verbose --black [aliases] test=pytest ================================================ FILE: src/twarc/__init__.py ================================================ from .client import Twarc from .client2 import Twarc2 from .version import version from .expansions import ensure_flattened ================================================ FILE: src/twarc/__main__.py ================================================ from twarc.command2 import twarc2 if __name__ == "__main__": twarc2(prog_name="python -m twarc2") ================================================ FILE: src/twarc/client.py ================================================ # -*- coding: utf-8 -*- import os import re import sys import json import types import logging import datetime import requests import ssl from requests.exceptions import ConnectionError from requests.packages.urllib3.exceptions import ProtocolError from .decorators import * from twarc.version import version, user_agent from requests_oauthlib import OAuth1, OAuth1Session, OAuth2Session from oauthlib.oauth2 import BackendApplicationClient if sys.version_info[:2] <= (2, 7): # Python 2 get_input = raw_input str_type = unicode import ConfigParser as configparser from urlparse import parse_qs else: # Python 3 get_input = input str_type = str import configparser from urllib.parse import parse_qs log = logging.getLogger("twarc") class Twarc(object): """ Twarc allows you retrieve data from the Twitter API. Each method is an iterator that runs to completion, and handles rate limiting so that it will go to sleep when Twitter tells it to, and wake back up when it is able to retrieve data from the API again. 
@filter_protected
def search(
    self,
    q,
    max_id=None,
    since_id=None,
    lang=None,
    result_type="recent",
    geocode=None,
    max_pages=None,
):
    """
    Iterate over decoded tweets from the v1.1 search endpoint for query q.

    lang and geocode are forwarded when they are not None. A truthy
    since_id is sent lowered by one (so the bound is inclusive) and the
    iteration ends as soon as the tweet carrying exactly that id is seen.
    A truthy max_id bounds the first request; every following request uses
    the id of the last tweet of the previous page minus one. result_type
    falls back to "recent" unless it is "mixed", "recent" or "popular".
    max_pages, when given, caps the number of pages requested.
    """
    endpoint = "https://api.twitter.com/1.1/search/tweets.json"
    query = {
        "count": 100,
        "q": q,
        "include_ext_alt_text": "true",
        "include_ext_is_blue_verified": "true",
        "include_entities": "true",
    }

    for name, value in (("lang", lang), ("geocode", geocode)):
        if value is not None:
            query[name] = value

    if since_id:
        # Make the since_id inclusive, so we can avoid retrieving
        # an empty page of results in some cases
        query["since_id"] = str(int(since_id) - 1)

    known_types = ("mixed", "recent", "popular")
    query["result_type"] = result_type if result_type in known_types else "recent"

    pages_seen = 0
    while True:
        # max_id moves backwards as pages are consumed
        if max_id:
            query["max_id"] = max_id

        response = self.get(endpoint, params=query)
        pages_seen += 1
        page = response.json()["statuses"]

        if not page:
            log.info("no new tweets matching %s", query)
            return

        for tweet in page:
            # Seeing the since_id tweet itself means nothing newer is left
            if since_id is not None and tweet["id_str"] == str(since_id):
                log.info("no new tweets matching %s", query)
                return
            yield tweet

        if max_pages is not None and pages_seen == max_pages:
            log.info("reached max page limit for %s", query)
            return

        max_id = str(int(tweet["id_str"]) - 1)
def timeline(
    self, user_id=None, screen_name=None, max_id=None, since_id=None, max_pages=None
):
    """
    Iterate over the most recent tweets of a user, or of the home timeline.

    Pass either user_id or screen_name (a leading '@' is stripped) to read
    that user's timeline; pass neither to read the authenticated home
    timeline. Passing both raises ValueError.

    A truthy since_id is sent lowered by one (inclusive bound) and the
    iteration ends when the tweet with exactly that id is seen. A truthy
    max_id bounds the first request; later requests use the id of the last
    tweet of the previous page minus one. max_pages caps the number of
    pages requested.

    A 404 or 401 HTTP error ends the iteration with a warning; any other
    HTTP error is re-raised.
    """
    if user_id and screen_name:
        raise ValueError("only user_id or screen_name may be passed")

    # Strip if screen_name is prefixed with '@'
    if screen_name:
        screen_name = screen_name.lstrip("@")

    params = {"count": 200}
    if screen_name:
        target = screen_name
        params["screen_name"] = target
    elif user_id:
        target = str(user_id)
        params["user_id"] = target
    else:
        # Home timeline: there is no user to identify, so no user parameter
        # is sent (previously the literal string "None" went out as user_id).
        target = None

    if target is None:
        url = "https://api.twitter.com/1.1/statuses/home_timeline.json"
        log.info("starting home timeline")
    else:
        url = "https://api.twitter.com/1.1/statuses/user_timeline.json"
        log.info("starting user timeline for user %s", target)

    params["include_ext_alt_text"] = "true"
    params["include_ext_is_blue_verified"] = "true"

    if since_id:
        # Make the since_id inclusive, so we can avoid retrieving
        # an empty page of results in some cases
        params["since_id"] = str(int(since_id) - 1)

    retrieved_pages = 0
    while True:
        if max_id:
            params["max_id"] = max_id

        try:
            resp = self.get(url, params=params, allow_404=True)
        except requests.exceptions.HTTPError as e:
            status_code = e.response.status_code
            if status_code == 404:
                log.warning("no timeline available for %s", target)
                return
            if status_code == 401:
                log.warning("protected account %s", target)
                return
            raise
        retrieved_pages += 1

        statuses = resp.json()
        if not statuses:
            log.info("no new tweets matching %s", params)
            return

        for status in statuses:
            # We've certainly reached the end of new results
            if since_id is not None and status["id_str"] == str(since_id):
                log.info("no new tweets matching %s", params)
                return
            # If you request an invalid user_id, you may still get
            # results so need to check.
            if not user_id or target == status.get("user", {}).get("id_str"):
                yield status

        if max_pages is not None and retrieved_pages == max_pages:
            log.info("reached max page limit for %s", params)
            return

        max_id = str(int(status["id_str"]) - 1)
def friend_ids(self, user, max_pages=None):
    """
    Yield the ids of the accounts the given user follows, one at a time.

    user may be a screen_name (a leading '@' is stripped) or a user_id; a
    value made only of digits is sent as user_id, anything else as
    screen_name. Pages are followed through the cursor until it reaches 0
    or until max_pages pages have been retrieved. HTTP errors are re-raised
    after a 404 has been logged.
    """
    handle = str(user).lstrip("@")
    endpoint = "https://api.twitter.com/1.1/friends/ids.json"

    lookup_key = "user_id" if re.match(r"^\d+$", handle) else "screen_name"
    query = {lookup_key: handle, "cursor": -1}

    page_count = 0
    while query["cursor"] != 0:
        try:
            response = self.get(endpoint, params=query, allow_404=True)
            page_count += 1
        except requests.exceptions.HTTPError as err:
            if err.response.status_code == 404:
                log.error("no users matching %s", handle)
            raise err

        payload = response.json()
        for friend in payload["ids"]:
            yield str_type(friend)
        query["cursor"] = payload["next_cursor"]

        if max_pages is not None and page_count == max_pages:
            log.info("reached max friend page limit for %s", query)
            break
def sample(self, event=None, record_keepalive=False):
    """
    Returns a small random sample of all public statuses. The Tweets
    returned by the default access level are the same, so if two different
    clients connect to this endpoint, they will see the same Tweets.

    If a threading.Event is provided for event and the event is set,
    the sample will be interrupted.

    Empty lines on the stream are keep-alives: they are logged and, when
    record_keepalive is true, yielded as the string "keep-alive". Lines
    that cannot be decoded as JSON are logged and skipped. After an error
    the stream is reconnected following a pause that grows with the number
    of consecutive errors; once self.http_errors (when non-zero) errors
    have occurred in a row the last exception is re-raised.
    """
    url = "https://stream.twitter.com/1.1/statuses/sample.json"
    params = {"stall_warning": True}
    headers = {"accept-encoding": "deflate, gzip"}
    errors = 0
    while True:
        try:
            log.info("connecting to sample stream")
            resp = self.post(url, params, headers=headers, stream=True)
            errors = 0
            for line in resp.iter_lines(chunk_size=512):
                if event and event.is_set():
                    log.info("stopping sample")
                    # Explicitly close response
                    resp.close()
                    return
                # iter_lines yields bytes, so comparing against "" never
                # matched; test emptiness instead, as filter() does.
                if not line:
                    log.info("keep-alive")
                    if record_keepalive:
                        yield "keep-alive"
                    continue
                try:
                    tweet = json.loads(line.decode())
                except Exception as e:
                    log.error("json parse error: %s - %s", e, line)
                    continue
                yield tweet
        except requests.exceptions.HTTPError as e:
            errors += 1
            log.error("caught http error %s on %s try", e, errors)
            if self.http_errors and errors == self.http_errors:
                log.warning("too many errors")
                raise e
            # 420 gets a much longer back-off than other HTTP errors
            delay = errors * 60 if e.response.status_code == 420 else errors * 5
            if interruptible_sleep(delay, event):
                log.info("stopping sample")
                return
        except Exception as e:
            errors += 1
            log.error("caught exception %s on %s try", e, errors)
            if self.http_errors and errors == self.http_errors:
                log.warning("too many errors")
                raise e
            if interruptible_sleep(errors, event):
                log.info("stopping sample")
                return
def retweets(self, tweet_ids):
    """
    Retrieves up to the last 100 retweets for the provided iterator of
    tweet_ids.

    Ids that have a strip method have surrounding whitespace removed before
    use. An HTTP error for one id never stops the iteration: a 404 is
    logged at info level, any other status is logged as an error, and the
    next id is tried.
    """
    for tweet_id in tweet_ids:
        if hasattr(tweet_id, "strip"):
            tweet_id = tweet_id.strip()
        log.info("retrieving retweets of %s", tweet_id)
        url = "https://api.twitter.com/1.1/statuses/retweets/{}.json".format(tweet_id)

        # Only the request is guarded, so exceptions raised while the
        # caller consumes the yielded tweets are not swallowed here.
        try:
            resp = self.get(url, params={"count": 100}, allow_404=True)
        except requests.exceptions.HTTPError as e:
            if e.response.status_code == 404:
                log.info("can't get tweets for non-existent tweet: %s", tweet_id)
            else:
                # Previously these failures vanished without any trace.
                log.error(
                    "http error %s while retrieving retweets of %s",
                    e.response.status_code,
                    tweet_id,
                )
            continue

        for tweet in resp.json():
            yield tweet
def trends_closest(self, lat, lon):
    """
    Return the decoded JSON response of the trends/closest endpoint for
    the supplied lat/lon. HTTP errors raised by the request propagate to
    the caller unchanged.
    """
    endpoint = "https://api.twitter.com/1.1/trends/closest.json"
    coordinates = {"lat": lat, "long": lon}
    response = self.get(endpoint, params=coordinates)
    return response.json()
def list_members(
    self, list_id=None, slug=None, owner_screen_name=None, owner_id=None
):
    """
    Yield the members of a list, following the cursor until it reaches 0.

    Either list_id, or slug together with owner_screen_name or owner_id,
    must be supplied; this is checked with an assert when iteration
    starts. HTTP errors are re-raised after a 404 has been logged.
    """
    assert list_id or (slug and (owner_screen_name or owner_id))
    endpoint = "https://api.twitter.com/1.1/lists/members.json"

    query = {"cursor": -1}
    if list_id:
        query["list_id"] = list_id
    elif owner_screen_name:
        query["slug"] = slug
        query["owner_screen_name"] = owner_screen_name
    else:
        query["slug"] = slug
        query["owner_id"] = owner_id

    while query["cursor"] != 0:
        try:
            response = self.get(endpoint, params=query, allow_404=True)
        except requests.exceptions.HTTPError as err:
            if err.response.status_code == 404:
                log.error("no matching list")
            raise err

        page = response.json()
        for member in page["users"]:
            yield member
        query["cursor"] = page["next_cursor"]
@catch_timeout
def connect(self):
    """
    Sets up the HTTP session to talk to Twitter. If one is active it is
    closed and another one is opened.

    The kind of session depends on the instance settings: basic
    authentication when gnip_auth is set, an OAuth2 application session
    when app_auth is set, and an OAuth1 user session otherwise. The twarc
    User-Agent header is installed on the new session.

    Raises RuntimeError("MissingKeys") when the credentials required for
    the selected mode are incomplete.
    """
    if self.gnip_auth:
        keys_present = (
            self.gnip_username and self.gnip_password and self.gnip_account
        )
    else:
        keys_present = (
            self.consumer_key
            and self.consumer_secret
            and self.access_token
            and self.access_token_secret
        )
    if not keys_present:
        raise RuntimeError("MissingKeys")

    if self.client:
        log.info("closing existing http session")
        self.client.close()
    # A requests Response is falsy for 4xx/5xx statuses, so a truthiness
    # test would leave error responses open; compare against None instead.
    if self.last_response is not None:
        log.info("closing last response")
        self.last_response.close()
    log.info("creating http session")

    if self.gnip_auth:
        log.info("creating basic user authentication for gnip")
        session = requests.Session()
        session.auth = (self.gnip_username, self.gnip_password)
        self.client = session
    elif not self.app_auth:
        log.info("creating OAuth1 user authentication")
        self.client = OAuth1Session(
            client_key=self.consumer_key,
            client_secret=self.consumer_secret,
            resource_owner_key=self.access_token,
            resource_owner_secret=self.access_token_secret,
        )
    else:
        log.info("creating OAuth2 app authentication")
        client = BackendApplicationClient(client_id=self.consumer_key)
        oauth = OAuth2Session(client=client)
        # The returned token is not needed here; only the call is.
        oauth.fetch_token(
            token_url="https://api.twitter.com/oauth2/token",
            client_id=self.consumer_key,
            client_secret=self.consumer_secret,
        )
        self.client = oauth

    if self.client:
        self.client.headers.update({"User-Agent": user_agent})
""" env = os.environ.get if not self.consumer_key: self.consumer_key = env("CONSUMER_KEY") if not self.consumer_secret: self.consumer_secret = env("CONSUMER_SECRET") if not self.access_token: self.access_token = env("ACCESS_TOKEN") if not self.access_token_secret: self.access_token_secret = env("ACCESS_TOKEN_SECRET") if not self.gnip_username: self.gnip_username = env("GNIP_USERNAME") if not self.gnip_password: self.gnip_password = env("GNIP_PASSWORD") if not self.gnip_account: self.gnip_account = env("GNIP_ACCOUNT") if self.config: if self.gnip_auth and not ( self.gnip_username and self.gnip_password and self.gnip_account ): self.load_config() elif not self.gnip_auth and not ( self.consumer_key and self.consumer_secret and self.access_token and self.access_token_secret ): self.load_config() def validate_keys(self): """ Validate the keys provided are authentic credentials. """ if self.gnip_auth: url = "https://gnip-api.twitter.com/metrics/usage/accounts/{}.json".format( self.gnip_account ) keys_present = ( self.gnip_account and self.gnip_username and self.gnip_password ) elif self.app_auth: # no need to validate keys when using OAuth2 App Auth. return True else: url = "https://api.twitter.com/1.1/account/verify_credentials.json" keys_present = ( self.consumer_key and self.consumer_secret and self.access_token and self.access_token_secret ) if keys_present: try: # Need to explicitly reconnect to confirm the current creds # are used in the session object. 
def save_config(self, profile):
    """
    Write the current credentials to the given profile of the config file.

    The file at self.config is read first so other profiles are kept; an
    existing section named profile is replaced. With gnip_auth the gnip
    username, password and account are stored, otherwise the four OAuth1
    keys are stored.

    Returns the ConfigParser that was written, or None when self.config is
    not set (nothing is written in that case).
    """
    if not self.config:
        return

    config = configparser.ConfigParser()
    config.read(self.config)

    if config.has_section(profile):
        config.remove_section(profile)
    config.add_section(profile)

    if self.gnip_auth:
        # Each gnip option gets its own attribute; previously all three
        # were filled with access_token_secret.
        option_names = ("gnip_username", "gnip_password", "gnip_account")
    else:
        option_names = (
            "consumer_key",
            "consumer_secret",
            "access_token",
            "access_token_secret",
        )
    for option in option_names:
        config.set(profile, option, getattr(self, option))

    with open(self.config, "w") as config_file:
        config.write(config_file)

    return config
already have these application keys in your config %s\n" % self.config ) print("consumer key: %s" % self.consumer_key) print("consumer secret: %s" % self.consumer_secret) reuse = get_input( "\nWould you like to use those for your new profile? [y/n] " ) reuse = reuse.lower() == "y" if not reuse: print( "\nPlease enter your Twitter application credentials from apps.twitter.com:\n" ) self.consumer_key = get_input("consumer key: ") self.consumer_secret = get_input("consumer secret: ") answered = False while not answered: print( "\nHow would you like twarc to obtain your user keys?\n\n1) generate access keys by visiting Twitter\n2) manually enter your access token and secret\n" ) answer = get_input("Please enter your choice [1/2] ") if answer == "1": answered = True generate = True elif answer == "2": answered = True generate = False if generate: request_token_url = "https://api.twitter.com/oauth/request_token" oauth = OAuth1(self.consumer_key, client_secret=self.consumer_secret) r = requests.post(url=request_token_url, auth=oauth) credentials = parse_qs(r.text) if not credentials: print("\nError: invalid credentials.") print( "Please check that you are copying and pasting correctly and try again.\n" ) return resource_owner_key = credentials.get("oauth_token")[0] resource_owner_secret = credentials.get("oauth_token_secret")[0] base_authorization_url = "https://api.twitter.com/oauth/authorize" authorize_url = ( base_authorization_url + "?oauth_token=" + resource_owner_key ) print( "\nPlease log into Twitter and visit this URL in your browser:\n%s" % authorize_url ) verifier = get_input( "\nAfter you have authorized the application please enter the displayed PIN: " ) access_token_url = "https://api.twitter.com/oauth/access_token" oauth = OAuth1( self.consumer_key, client_secret=self.consumer_secret, resource_owner_key=resource_owner_key, resource_owner_secret=resource_owner_secret, verifier=verifier, ) r = requests.post(url=access_token_url, auth=oauth) credentials = 
def is_standard_v1(self, url):
    """
    Return False when url starts with the gnip API host or with one of the
    v1.1 premium search prefixes (30day, fullarchive); True for any other
    url.
    """
    non_standard_prefixes = (
        "https://gnip-api.twitter.com",
        "https://api.twitter.com/1.1/tweets/search/30day",
        "https://api.twitter.com/1.1/tweets/search/fullarchive",
    )
    return not url.startswith(non_standard_prefixes)
""" def __init__( self, consumer_key=None, consumer_secret=None, access_token=None, access_token_secret=None, bearer_token=None, connection_errors=0, metadata=True, ): """ Instantiate a Twarc2 instance to talk to the Twitter V2+ API. The client can use either App or User authentication, but only one at a time. Whether app auth or user auth is used depends on which credentials are provided on initialisation: 1. If a `bearer_token` is passed, app auth is always used. 2. If a `consumer_key` and `consumer_secret` are passed without an `access_token` and `access_token_secret`, app auth is used. 3. If `consumer_key`, `consumer_secret`, `access_token` and `access_token_secret` are all passed, then user authentication is used instead. Args: consumer_key (str): The API key. consumer_secret (str): The API secret. access_token (str): The Access Token access_token_secret (str): The Access Token Secret bearer_token (str): Bearer Token, can be generated from API keys. connection_errors (int): Number of retries for GETs metadata (bool): Append `__twarc` metadata to results. 
def _prepare_params(self, **kwargs):
    """
    Prepare URL parameters and defaults for fields, expansions and others.

    Field selectors are renamed to their API parameter names (for example
    ``tweet_fields`` becomes ``tweet.fields``). A selector that is passed
    but empty falls back to the comma-joined library default list.
    ``start_time`` / ``end_time`` values that are truthy and not already
    strings are formatted with ``_ts``; strings are passed through.
    All remaining keywords (backfill_minutes, next_token,
    pagination_token, sort_order, ...) are passed as is.

    Keywords whose value is ``None`` never appear in the result. This now
    includes ``start_time`` and ``end_time``, which previously produced an
    explicit ``None`` entry.

    Args:
        **kwargs: Parameters for the request, using the Python keyword
            names (``tweet_fields``, ``start_time``, ``max_results``, ...).

    Returns:
        dict: Parameters keyed by API parameter name.
    """
    # (keyword name, API parameter name, default list). The defaults are
    # wrapped in lambdas so they are only looked up when actually needed.
    field_specs = (
        ("expansions", "expansions", lambda: EXPANSIONS),
        ("tweet_fields", "tweet.fields", lambda: TWEET_FIELDS),
        ("user_fields", "user.fields", lambda: USER_FIELDS),
        ("media_fields", "media.fields", lambda: MEDIA_FIELDS),
        ("poll_fields", "poll.fields", lambda: POLL_FIELDS),
        ("place_fields", "place.fields", lambda: PLACE_FIELDS),
        ("list_fields", "list.fields", lambda: LIST_FIELDS),
    )

    params = {}

    # Defaults for fields and expansions. The keyword is always popped so
    # that a falsy non-None value (e.g. "") cannot leak back into the
    # result under its Python name or overwrite the default.
    for keyword, api_name, default in field_specs:
        if keyword in kwargs:
            value = kwargs.pop(keyword)
            params[api_name] = value if value else ",".join(default())

    # Format start_time and end_time; omit them entirely when None, the
    # same way every other None-valued keyword is omitted below.
    for keyword in ("start_time", "end_time"):
        if keyword in kwargs:
            value = kwargs.pop(keyword)
            if value is None:
                continue
            if value and not isinstance(value, str):
                value = _ts(value)
            params[keyword] = value

    # Any other parameters passed as is, minus unset (None) ones.
    params.update((k, v) for k, v in kwargs.items() if v is not None)
    return params
if using_counts: while True: for response in self.get_paginated(url, params=params): # Note that we're ensuring the appropriate amount of sleep is # taken before yielding every item. This ensures that we won't # exceed the rate limit even in cases where a response generator # is not completely consumed. This might be more conservative # than necessary. time.sleep(sleep_between) # can't return without 'data' if there are no results if "data" in response: last_time_start = response["data"][0]["start"] time_periods_collected += len(response["data"]) yield response else: log.info(f"Retrieved an empty page of results.") # Check that we've actually reached the end, and restart if necessary. # Note we need to exactly match the Twitter format, which is a little # fiddly because Python doesn't let you specify milliseconds only for # strftime. if ( # If there's no explicit start time we're getting the last # 30 days by default, so don't need to do the tricky # things. start_time is None # We've actually reached the specified start time or ( (start_time.strftime("%Y-%m-%dT%H:%M:%S.%f")[:-3] + "Z") == last_time_start ) # Or, we've hit one of the special cases that returns no rows # of data, and immediately indicates zero tweets returned, like # searching for a tweet that doesn't exist. or (time_periods_collected == 0) ): break else: # Note that we're passing the Twitter start_time straight # back to it - this avoids parsing and reformatting the date. params["end_time"] = last_time_start # Remove the next_token reference, we're restarting the search. if "next_token" in params: del params["next_token"] log.info( "Detected incomplete counts, restarting with " f"{last_time_start} as the new end_time" ) else: for response in self.get_paginated(url, params=params): # Note that we're ensuring the appropriate amount of sleep is # taken before yielding every item. This ensures that we won't # exceed the rate limit even in cases where a response generator # is not completely consumed. 
def _lists(
    self,
    url,
    expansions=None,
    list_fields=None,
    user_fields=None,
    max_results=None,
    pagination_token=None,
):
    """
    Page through a lists endpoint, yielding only pages that carry data.

    Args:
        url (str): The lists endpoint to request.
        expansions: When truthy, the ``owner_id`` expansion is requested.
        list_fields: List fields to request.
        user_fields: User fields to request.
        max_results (int): Maximum number of results per page.
        pagination_token (str): Token of the page to start from.

    Returns:
        generator[dict]: A generator, dict for each page with a ``data`` key.
    """
    query = self._prepare_params(
        list_fields=list_fields,
        user_fields=user_fields,
        max_results=max_results,
        pagination_token=pagination_token,
    )
    # The flag is only checked for truthiness; the expansion value is fixed.
    if expansions:
        query["expansions"] = "owner_id"

    for page in self.get_paginated(url, params=query):
        # A page can come back without 'data' when there are no results.
        if "data" not in page:
            log.info(f"Retrieved an empty page of results of lists for {url}")
            continue
        yield page
def list_memberships(
    self,
    user,
    expansions=None,
    list_fields=None,
    user_fields=None,
    max_results=None,
    pagination_token=None,
):
    """
    Returns all Lists a specified user is a member of.

    Calls [GET /2/users/:id/list_memberships](https://developer.twitter.com/en/docs/twitter-api/lists/list-members/api-reference/get-users-id-list_memberships)

    Args:
        user (int): ID of the user; resolved through `_ensure_user_id`.
        expansions enum (owner_id): Request the additional objects related
            to each returned List.
        list_fields enum (created_at, follower_count, member_count, private, description, owner_id):
            Which List fields to deliver with each returned List object.
        user_fields enum (created_at, description, entities, id, location, name, pinned_tweet_id, profile_image_url, protected, public_metrics, url, username, verified, withheld):
            Which user fields to deliver with the users object, as a
            comma-separated list without spaces.
        max_results (int): Maximum number of results per page, between 1
            and 100.
        pagination_token (string): Token for requesting the next (or
            previous) page of results.

    Returns:
        generator[dict]: A generator, dict for each page of results.
    """
    resolved_id = self._ensure_user_id(user)
    options = {
        "expansions": expansions,
        "list_fields": list_fields,
        "user_fields": user_fields,
        "max_results": max_results,
        "pagination_token": pagination_token,
    }
    return self._lists(
        url=f"https://api.twitter.com/2/users/{resolved_id}/list_memberships",
        **options,
    )
def followed_lists(
    self,
    user,
    expansions=None,
    list_fields=None,
    user_fields=None,
    max_results=None,
    pagination_token=None,
):
    """
    Returns all Lists a specified user follows.

    Calls [GET /2/users/:id/followed_lists](https://developer.twitter.com/en/docs/twitter-api/lists/list-follows/api-reference/get-users-id-followed_lists)

    Args:
        user (int): ID of the user; resolved through `_ensure_user_id`.
        expansions enum (owner_id): Request the additional objects related
            to each returned List.
        list_fields enum (created_at, follower_count, member_count, private, description, owner_id):
            Which List fields to deliver with each returned List object.
        user_fields enum (created_at, description, entities, id, location, name, pinned_tweet_id, profile_image_url, protected, public_metrics, url, username, verified, withheld):
            Which user fields to deliver with the users object, as a
            comma-separated list without spaces.
        max_results (int): Maximum number of results per page, between 1
            and 100.
        pagination_token (string): Token for requesting the next (or
            previous) page of results.

    Returns:
        generator[dict]: A generator, dict for each page of results.
    """
    resolved_id = self._ensure_user_id(user)
    options = {
        "expansions": expansions,
        "list_fields": list_fields,
        "user_fields": user_fields,
        "max_results": max_results,
        "pagination_token": pagination_token,
    }
    return self._lists(
        url=f"https://api.twitter.com/2/users/{resolved_id}/followed_lists",
        **options,
    )
def list_lookup(self, list_id, expansions=None, list_fields=None, user_fields=None):
    """
    Returns the details of a specified List.

    Calls [GET /2/lists/:id](https://developer.twitter.com/en/docs/twitter-api/lists/list-lookup/api-reference/get-lists-id)

    Args:
        list_id (int): ID of the list.
        expansions enum (owner_id): When truthy, the ``owner_id`` expansion
            is requested.
        list_fields enum (created_at, follower_count, member_count, private, description, owner_id):
            Which List fields to deliver with the returned List object.
        user_fields enum (created_at, description, entities, id, location, name, pinned_tweet_id, profile_image_url, protected, public_metrics, url, username, verified, withheld):
            Which user fields to deliver with the users object, as a
            comma-separated list without spaces.

    Returns:
        dict: Result dictionary, passed through `_append_metadata` when
            `self.metadata` is set.
    """
    query = self._prepare_params(list_fields=list_fields, user_fields=user_fields)
    if expansions:
        query["expansions"] = "owner_id"

    response = self.get(f"https://api.twitter.com/2/lists/{list_id}", params=query)
    payload = response.json()
    if not self.metadata:
        return payload
    return _append_metadata(payload, response.url)
def search_recent(
    self,
    query,
    since_id=None,
    until_id=None,
    start_time=None,
    end_time=None,
    max_results=100,
    expansions=None,
    tweet_fields=None,
    user_fields=None,
    media_fields=None,
    poll_fields=None,
    place_fields=None,
    next_token=None,
    sort_order=None,
):
    """
    Search Twitter for the given query in the last seven days,
    using the `/search/recent` endpoint.

    Calls [GET /2/tweets/search/recent](https://developer.twitter.com/en/docs/twitter-api/tweets/search/api-reference/get-tweets-search-recent)

    Args:
        query (str): The query string to be passed directly to the Twitter API.
        since_id (int): Return all tweets since this tweet_id.
        until_id (int): Return all tweets up to this tweet_id.
        start_time (datetime): Return all tweets after this time (UTC datetime).
        end_time (datetime): Return all tweets before this time (UTC datetime).
        max_results (int): The maximum number of results per request. Max is 100.
        sort_order (str): Order tweets based on relevancy or recency.

    Returns:
        generator[dict]: a generator, dict for each paginated response.
    """
    # Everything is forwarded unchanged; only the endpoint is fixed here.
    search_options = {
        "query": query,
        "since_id": since_id,
        "until_id": until_id,
        "start_time": start_time,
        "end_time": end_time,
        "max_results": max_results,
        "expansions": expansions,
        "tweet_fields": tweet_fields,
        "user_fields": user_fields,
        "media_fields": media_fields,
        "poll_fields": poll_fields,
        "place_fields": place_fields,
        "next_token": next_token,
        "sort_order": sort_order,
    }
    return self._search(
        url="https://api.twitter.com/2/tweets/search/recent",
        **search_options,
    )
@requires_app_auth
def counts_recent(
    self,
    query,
    since_id=None,
    until_id=None,
    start_time=None,
    end_time=None,
    granularity="hour",
):
    """
    Retrieve counts for the given query in the last seven days,
    using the `/counts/recent` endpoint.

    Calls [GET /2/tweets/counts/recent](https://developer.twitter.com/en/docs/twitter-api/tweets/counts/api-reference/get-tweets-counts-recent)

    Args:
        query (str): The query string to be passed directly to the Twitter API.
        since_id (int): Return all tweets since this tweet_id.
        until_id (int): Return all tweets up to this tweet_id.
        start_time (datetime): Return all tweets after this time (UTC datetime).
        end_time (datetime): Return all tweets before this time (UTC datetime).
        granularity (str): Count aggregation level: `day`, `hour`, `minute`.
            Default is `hour`.

    Returns:
        generator[dict]: a generator, dict for each paginated response.
    """
    # The shared search helper requires these arguments; a counts request
    # leaves all of them unset.
    unset = dict.fromkeys(
        (
            "max_results",
            "expansions",
            "tweet_fields",
            "user_fields",
            "media_fields",
            "poll_fields",
            "place_fields",
            "sort_order",
        )
    )
    return self._search(
        url="https://api.twitter.com/2/tweets/counts/recent",
        query=query,
        since_id=since_id,
        until_id=until_id,
        start_time=start_time,
        end_time=end_time,
        granularity=granularity,
        **unset,
    )
def user_lookup(
    self,
    users,
    usernames=False,
    expansions=None,
    tweet_fields=None,
    user_fields=None,
):
    """
    Returns fully populated user profiles for the given iterator of
    user_id or usernames. By default user_lookup expects user ids but if
    you want to pass in usernames set usernames = True.

    Yields one page of results at a time (in blocks of at most 100 user
    profiles).

    Calls [GET /2/users](https://developer.twitter.com/en/docs/twitter-api/users/lookup/api-reference/get-users)

    Args:
        users (iterable): User IDs or usernames to lookup.
        usernames (bool): Parse `users` as usernames, not IDs.
        expansions: When truthy, the ``pinned_tweet_id`` expansion is
            requested.
        tweet_fields: Tweet fields to request.
        user_fields: User fields to request.

    Returns:
        generator[dict]: a generator, dict for each batch of 100 users.

    Raises:
        TypeError: If `users` is a string. This is raised at call time,
            not on first iteration of the result.
    """
    # Validate eagerly. If this function were itself a generator, the check
    # would not run until the caller started iterating the result.
    if isinstance(users, str):
        raise TypeError("users must be an iterable other than a string")

    if usernames:
        url = "https://api.twitter.com/2/users/by"
        id_param = "usernames"
    else:
        url = "https://api.twitter.com/2/users"
        id_param = "ids"

    def lookup_batch(batch):
        # One request for up to 100 comma-joined identifiers.
        params = self._prepare_params(
            tweet_fields=tweet_fields,
            user_fields=user_fields,
        )
        if expansions:
            params["expansions"] = "pinned_tweet_id"
        params[id_param] = ",".join(batch)

        resp = self.get(url, params=params)
        data = resp.json()
        if self.metadata:
            data = _append_metadata(data, resp.url)
        return data

    def pages():
        batch = []
        for item in users:
            batch.append(str(item).strip())
            if len(batch) == 100:
                yield lookup_batch(batch)
                batch = []
        # Flush the final, partially filled batch.
        if batch:
            yield lookup_batch(batch)

    return pages()
""" url = "https://api.twitter.com/2/tweets/sample/stream" params = self._prepare_params( expansions=expansions, tweet_fields=tweet_fields, user_fields=user_fields, media_fields=media_fields, poll_fields=poll_fields, place_fields=place_fields, backfill_minutes=backfill_minutes, ) yield from self._stream(url, params, event, record_keepalive) @requires_app_auth def add_stream_rules(self, rules): """ Adds new rules to the filter stream. Calls [POST /2/tweets/search/stream/rules](https://developer.twitter.com/en/docs/twitter-api/tweets/filtered-stream/api-reference/post-tweets-search-stream-rules) Args: rules (list[dict]): A list of rules to add. Returns: dict: JSON Response from Twitter API. """ url = "https://api.twitter.com/2/tweets/search/stream/rules" return self.post(url, {"add": rules}).json() @requires_app_auth def get_stream_rules(self): """ Returns a list of rules for the filter stream. Calls [GET /2/tweets/search/stream/rules](https://developer.twitter.com/en/docs/twitter-api/tweets/filtered-stream/api-reference/get-tweets-search-stream-rules) Returns: dict: JSON Response from Twitter API with a list of defined rules. """ url = "https://api.twitter.com/2/tweets/search/stream/rules" return self.get(url).json() @requires_app_auth def delete_stream_rule_ids(self, rule_ids): """ Deletes rules from the filter stream. Calls [POST /2/tweets/search/stream/rules](https://developer.twitter.com/en/docs/twitter-api/tweets/filtered-stream/api-reference/post-tweets-search-stream-rules) Args: rule_ids (list[int]): A list of rule ids to delete. Returns: dict: JSON Response from Twitter API. 
def _stream(self, url, params, event, record_keepalive, tries=30):
    """
    A generator that handles streaming data from a response and catches
    and logs any request exceptions, sleeps (quadratic backoff: the number
    of consecutive errors squared, in seconds) and restarts the stream.

    The stream is also restarted, without sleeping, when
    `_check_for_disconnect` reports a disconnect for a received item.

    Args:
        url (str): the streaming endpoint URL
        params (dict): any query parameters to use with the url
        event (threading.Event): Manages a flag to stop the process.
        record_keepalive (bool): whether to output keep-alive events.
        tries (int): the number of consecutive errors tolerated before
            giving up

    Returns:
        generator[dict]: A generator of tweet dicts (and the string
            "keep-alive" when `record_keepalive` is set).
    """
    errors = 0
    while True:
        log.info(f"connecting to stream {url}")
        resp = self.get(url, params=params, stream=True)
        try:
            for line in resp.iter_lines():
                # Any received line means the connection is healthy again.
                errors = 0

                # quit & close the stream if the event is set
                if event and event.is_set():
                    log.info("stopping response stream")
                    return

                # return the JSON data w/ optional keep-alive
                if not line:
                    log.info("keep-alive")
                    if record_keepalive:
                        yield "keep-alive"
                    continue

                data = json.loads(line.decode())
                if self.metadata:
                    data = _append_metadata(data, resp.url)
                yield data

                # Leave the read loop so the outer loop reconnects.
                if self._check_for_disconnect(data):
                    break
        except requests.exceptions.RequestException as e:
            # Logger.warn is a deprecated alias; use warning().
            log.warning("caught exception during streaming: %s", e)
            errors += 1
            if errors > tries:
                log.error(f"too many consecutive errors ({tries}). stopping")
                return
            secs = errors**2
            log.info("sleeping %s seconds before reconnecting", secs)
            time.sleep(secs)
        finally:
            # Always release the streaming connection before reconnecting
            # or returning, including when the consumer abandons the
            # generator mid-stream.
            resp.close()
def timeline(
    self,
    user,
    since_id=None,
    until_id=None,
    start_time=None,
    end_time=None,
    exclude_retweets=False,
    exclude_replies=False,
    max_results=100,
    expansions=None,
    tweet_fields=None,
    user_fields=None,
    media_fields=None,
    poll_fields=None,
    place_fields=None,
    pagination_token=None,
):
    """
    Retrieve up to the 3200 most recent tweets made by the given user.

    The user is resolved with `_ensure_user_id` and the request is
    delegated to `_timeline` with `timeline_type="tweets"`.

    Calls [GET /2/users/:id/tweets](https://developer.twitter.com/en/docs/twitter-api/tweets/timelines/api-reference/get-users-id-tweets)

    Args:
        user (int): ID of the user.
        since_id (int): results with a Tweet ID greater than (newer) than specified
        until_id (int): results with a Tweet ID less than (older) than specified
        start_time (datetime): oldest UTC timestamp from which the Tweets will be provided
        end_time (datetime): newest UTC timestamp from which the Tweets will be provided
        exclude_retweets (boolean): remove retweets from timeline results
        exclude_replies (boolean): remove replies from timeline results
        max_results (int): the maximum number of Tweets to retrieve. Between 5 and 100.

    Returns:
        generator[dict]: A generator, dict for each page of results.
    """
    # everything except the user is forwarded untouched
    passthrough = {
        "since_id": since_id,
        "until_id": until_id,
        "start_time": start_time,
        "end_time": end_time,
        "exclude_retweets": exclude_retweets,
        "exclude_replies": exclude_replies,
        "max_results": max_results,
        "expansions": expansions,
        "tweet_fields": tweet_fields,
        "user_fields": user_fields,
        "media_fields": media_fields,
        "poll_fields": poll_fields,
        "place_fields": place_fields,
        "pagination_token": pagination_token,
    }
    resolved_id = self._ensure_user_id(user)
    return self._timeline(
        user_id=resolved_id, timeline_type="tweets", **passthrough
    )
def following(
    self,
    user,
    user_id=None,
    max_results=1000,
    expansions=None,
    tweet_fields=None,
    user_fields=None,
    pagination_token=None,
):
    """
    Retrieve the user profiles of accounts followed by the given user.

    Calls [GET /2/users/:id/following](https://developer.twitter.com/en/docs/twitter-api/users/follows/api-reference/get-users-id-following)

    Args:
        user (int): ID of the user.
        user_id: when truthy it is used as-is and `user` is not looked up.
        expansions: any truthy value requests the `pinned_tweet_id`
            expansion; the value itself is not forwarded.

    Returns:
        generator[dict]: A generator, dict for each page of results.
    """
    # only resolve the user when the caller did not supply an id
    if not user_id:
        user_id = self._ensure_user_id(user)

    query = self._prepare_params(
        tweet_fields=tweet_fields,
        user_fields=user_fields,
        max_results=max_results,
        pagination_token=pagination_token,
    )
    if expansions:
        query["expansions"] = "pinned_tweet_id"

    return self.get_paginated(
        f"https://api.twitter.com/2/users/{user_id}/following", params=query
    )
def liked_tweets(
    self,
    user_id,
    max_results=100,
    expansions=None,
    tweet_fields=None,
    user_fields=None,
    media_fields=None,
    poll_fields=None,
    place_fields=None,
    pagination_token=None,
):
    """
    Retrieve the tweets liked by the given user_id.

    Args:
        user_id: user to look up; resolved with `_ensure_user_id`.
        max_results (int): page size forwarded to the API.
        expansions, tweet_fields, user_fields, media_fields, poll_fields,
            place_fields: field selections forwarded to `_prepare_params`.
        pagination_token (str): token of the page to start from.

    Returns:
        generator[dict]: A generator, dict for each page of results that
            contains a "data" key. Pages without "data" are logged and
            skipped.
    """
    user_id = self._ensure_user_id(user_id)
    url = f"https://api.twitter.com/2/users/{user_id}/liked_tweets"

    # Forward the caller's arguments. Hard-coded literals here would
    # silently ignore every option passed to this method.
    params = self._prepare_params(
        max_results=max_results,
        expansions=expansions,
        tweet_fields=tweet_fields,
        user_fields=user_fields,
        media_fields=media_fields,
        poll_fields=poll_fields,
        place_fields=place_fields,
        pagination_token=pagination_token,
    )

    for page in self.get_paginated(url, params=params):
        if "data" in page:
            yield page
        else:
            log.info(
                f"Retrieved an empty page of results for liked_tweets of {user_id}"
            )
def get_paginated(self, *args, **kwargs):
    """
    A wrapper around the `get` method that handles Twitter token based
    pagination. Yields one page (one API response) at a time.

    After each page, the `next_token` found in the page's `meta` is sent
    back with the following request until a page has no `next_token`.
    The caller's `params` dict is never modified; a copy carrying the
    token is used for follow-up requests.

    Args:
        *args: Variable length argument list; the first positional
            argument is the URL (it may also be passed as `url=`).
        **kwargs: Arbitrary keyword arguments passed on to `get`.

    Returns:
        generator[dict]: A generator, dict for each page of results.
    """
    resp = self.get(*args, **kwargs)
    page = resp.json()

    # the URL may be given positionally or by keyword
    url = args[0] if args else kwargs.get("url", "")

    if self.metadata:
        page = _append_metadata(page, resp.url)

    yield page

    # Todo: Maybe this should be backwards.. check for `next_token`
    endings = (
        "mentions",
        "tweets",
        "following",
        "followers",
        "liked_tweets",
        "liking_users",
        "retweeted_by",
        "members",
        "memberships",
        "followed_lists",
        "owned_lists",
        "pinned_lists",
    )

    # The search endpoints only take a next_token, but the timeline
    # endpoints take a pagination_token instead - this is a bit of a hack,
    # but check the URL ending to see which we should use.
    token_param = "pagination_token" if url.endswith(endings) else "next_token"

    while "meta" in page and "next_token" in page["meta"]:
        # copy so the dict owned by the caller is left untouched
        params = dict(kwargs.get("params") or {})
        params[token_param] = page["meta"]["next_token"]
        kwargs["params"] = params

        resp = self.get(*args, **kwargs)
        page = resp.json()

        if self.metadata:
            page = _append_metadata(page, resp.url)

        yield page
@requires_app_auth
def compliance_job_get(self, job_id):
    """
    Fetch a single compliance job by its ID.

    Calls [GET /2/compliance/jobs/{job_id}](https://developer.twitter.com/en/docs/twitter-api/compliance/batch-compliance/api-reference/get-compliance-jobs-id)

    Args:
        job_id (int): The ID of the compliance job.

    Returns:
        dict: The decoded API response, which contains a "data" key.

    Raises:
        ValueError: if the HTTP status is not 200, or if the decoded
            response has no "data" key.
    """
    response = self.client.get(
        "https://api.twitter.com/2/compliance/jobs/{}".format(job_id)
    )

    # anything other than 200 is reported with its status code
    if response.status_code != 200:
        raise ValueError(f"Error from API, response: {response.status_code}")

    job = response.json()
    if "data" not in job:
        raise ValueError(f"Unknown response from twitter: {job}")

    return job
def geo(
    self,
    lat=None,
    lon=None,
    query=None,
    ip=None,
    granularity="neighborhood",
    max_results=None,
):
    """
    Gets geographic places that can be useful in queries. This is a v1.1
    endpoint but is useful in querying the v2 API.

    Calls [1.1/geo/search.json](https://api.twitter.com/1.1/geo/search.json)

    Exactly one lookup mode is used, chosen in this order: lat/lon,
    then query, then ip.

    Args:
        lat (float): latitude to search around
        lon (float): longitude to search around
        query (str): text to match in the place name
        ip (str): use the ip address to locate places
        granularity (str) : neighborhood, city, admin, country
        max_results (int): maximum results to return

    Returns:
        dict: the decoded JSON response.

    Raises:
        ValueError: if none of lat/lon, query or ip is given, if
            granularity or max_results is invalid, or if the API does
            not answer with HTTP 200.
    """
    params = {}

    # compare against None so that 0.0 (equator / prime meridian) is a
    # valid coordinate
    if lat is not None and lon is not None:
        params["lat"] = lat
        params["long"] = lon
    elif query:
        params["query"] = query
    elif ip:
        params["ip"] = ip
    else:
        raise ValueError("geo() needs either lat/lon, query or ip")

    if granularity not in ("neighborhood", "city", "admin", "country"):
        raise ValueError(
            f"{granularity} is not valid value for granularity, please use neighborhood, city, admin or country"
        )
    params["granularity"] = granularity

    if max_results is not None:
        # bool is a subclass of int but is not a meaningful count
        if isinstance(max_results, bool) or not isinstance(max_results, int):
            raise ValueError("max_results must be an int")
        params["max_results"] = max_results

    url = "https://api.twitter.com/1.1/geo/search.json"

    result = self.get(url, params=params)
    if result.status_code != 200:
        raise ValueError(f"Error from API, response: {result.status_code}")

    return result.json()
try: error_name = next(self.user_lookup([user]))["errors"][0]["title"] return error_name != "Not Found Error" except KeyError: return True def _ensure_user_id(self, user): """ Always return a valid user id, look up if not numeric. """ user = str(user) is_numeric = re.match(r"^\d+$", user) if len(user) > 15 or (is_numeric and self._id_exists(user)): return user else: results = next(self.user_lookup([user], usernames=True)) if "data" in results and len(results["data"]) > 0: return results["data"][0]["id"] elif is_numeric: return user else: raise ValueError(f"No such user {user}") def _ensure_user(self, user): """ Always return a valid user object. """ user = str(user) is_numeric = re.match(r"^\d+$", user) lookup = [] if len(user) > 15 or (is_numeric and self._id_exists(user)): lookup = list(self.user_lookup([user]))[0] else: lookup = list(self.user_lookup([user], usernames=True))[0] if "data" in lookup: return lookup["data"][0] else: raise ValueError(f"No such user {user}") def _check_for_disconnect(self, data): """ Look for disconnect errors in a response, and reconnect if found. The function returns True if a disconnect was found and False otherwise. """ for error in data.get("errors", []): if error.get("disconnect_type") == "OperationalDisconnect": log.info("Received operational disconnect message, reconnecting") self.connect() return True return False def _ts(dt): """ Return ISO 8601 / RFC 3339 datetime in UTC. If no timezone is specified it is assumed to be in UTC. The Twitter API does not accept microseconds. Args: dt (datetime): a `datetime` object to format. Returns: str: an ISO 8601 / RFC 3339 datetime in UTC. """ if dt.tzinfo: dt = dt.astimezone(datetime.timezone.utc) else: dt = dt.replace(tzinfo=datetime.timezone.utc) return dt.isoformat(timespec="seconds") def _utcnow(): """ Return _now_ in ISO 8601 / RFC 3339 datetime in UTC. Returns: datetime: Current timestamp in UTC. 
def main():
    """
    Entry point of the twarc (v1.1 API) command line tool.

    Parses the command line, builds a `Twarc` client, runs the requested
    command to obtain an iterable of results (`things`), and writes each
    result to the output file or stdout as JSON, CSV or a bare ID.
    Exits through `sys.exit` / `parser.error` for `version`, `help`,
    `configure` and invalid input.
    """
    parser = get_argparser()
    args = parser.parse_args()

    command = args.command
    query = args.query or ""

    logging.basicConfig(
        filename=args.log,
        level=logging.INFO,
        format="%(asctime)s %(levelname)s %(message)s",
    )

    # log and stop when process receives SIGINT
    def stop(signum, frame):
        log.warning("process received SIGINT, stopping")
        sys.exit(0)

    signal.signal(signal.SIGINT, stop)

    def usage_and_exit():
        # shared by the "help" command and the unknown-command fallback
        parser.print_help()
        print("\nPlease use one of the following commands:\n")
        for cmd in commands:
            print(" - %s" % cmd)
        print("\nFor example:\n\n twarc search blacklivesmatter")
        sys.exit(1)

    if command == "version":
        print("twarc v%s" % version)
        sys.exit()
    elif command == "help" or not command:
        usage_and_exit()

    # Don't validate the keys if the command is "configure"
    validate_keys = not (command == "configure" or args.skip_key_validation)

    t = Twarc(
        consumer_key=args.consumer_key,
        consumer_secret=args.consumer_secret,
        access_token=args.access_token,
        access_token_secret=args.access_token_secret,
        connection_errors=args.connection_errors,
        http_errors=args.http_errors,
        config=args.config,
        profile=args.profile,
        tweet_mode=args.tweet_mode,
        protected=args.protected,
        validate_keys=validate_keys,
        app_auth=args.app_auth,
        gnip_auth=args.gnip_auth,
    )

    # every branch below must leave an iterable here
    things = []

    # calls that return tweets
    if command == "search":
        if len(args.lang) > 0:
            lang = args.lang[0]
        else:
            lang = None

        # if not using a premium endpoint do a standard search
        if not args.thirtyday and not args.fullarchive and not args.gnip_fullarchive:
            things = t.search(
                query,
                since_id=args.since_id,
                max_id=args.max_id,
                lang=lang,
                result_type=args.result_type,
                geocode=args.geocode,
            )
        else:
            # parse the dates if given
            from_date = parse_dt(args.from_date) if args.from_date else None
            to_date = parse_dt(args.to_date) if args.to_date else None
            if args.gnip_fullarchive:
                env = args.gnip_fullarchive
                product = "gnip_fullarchive"
            elif args.thirtyday:
                env = args.thirtyday
                product = "30day"
            else:
                env = args.fullarchive
                product = "fullarchive"
            things = t.premium_search(
                query,
                product,
                env,
                from_date=from_date,
                to_date=to_date,
                sandbox=args.sandbox,
                limit=args.limit,
            )

    elif command == "filter":
        things = t.filter(
            track=query, follow=args.follow, locations=args.locations, lang=args.lang
        )

    elif command == "dehydrate":
        input_iterator = fileinput.FileInput(
            query,
            mode="r",
            openhook=fileinput.hook_compressed,
        )
        things = t.dehydrate(input_iterator)

    elif command == "hydrate":
        input_iterator = fileinput.FileInput(
            query,
            mode="r",
            openhook=fileinput.hook_compressed,
        )
        things = t.hydrate(input_iterator)

    elif command == "tweet":
        things = [t.tweet(query)]

    elif command == "sample":
        things = t.sample()

    elif command == "timeline":
        kwargs = {"max_id": args.max_id, "since_id": args.since_id}
        if re.match(r"^[0-9]+$", query):
            kwargs["user_id"] = query
        elif query:
            kwargs["screen_name"] = query
        things = t.timeline(**kwargs)

    elif command == "retweets":
        if os.path.isfile(query):
            iterator = fileinput.FileInput(
                query,
                mode="r",
                openhook=fileinput.hook_compressed,
            )
            things = t.retweets(tweet_ids=iterator)
        else:
            things = t.retweets(tweet_ids=query.split(","))

    elif command == "users":
        if os.path.isfile(query):
            iterator = fileinput.FileInput(
                query,
                mode="r",
                openhook=fileinput.hook_compressed,
            )
            # peek at the first line to decide whether the file holds
            # numeric ids or screen names; close the handle afterwards
            with open(query) as id_file:
                first_line = next(id_file, "")
            if re.match(r"^[0-9,]+$", first_line):
                id_type = "user_id"
            else:
                id_type = "screen_name"
            things = t.user_lookup(ids=iterator, id_type=id_type)
        elif re.match(r"^[0-9,]+$", query):
            things = t.user_lookup(ids=query.split(","))
        else:
            things = t.user_lookup(ids=query.split(","), id_type="screen_name")

    elif command == "followers":
        things = t.follower_ids(query)

    elif command == "friends":
        things = t.friend_ids(query)

    elif command == "trends":
        # lookup woeid for geo-coordinate if appropriate
        geo = re.match(r"^([0-9-.]+),([0-9-.]+)$", query)
        if geo:
            lat, lon = map(float, geo.groups())
            if lat > 180 or lat < -180 or lon > 180 or lon < -180:
                parser.error("LAT and LONG must be within [-180.0, 180.0]")
            places = list(t.trends_closest(lat, lon))
            if len(places) == 0:
                parser.error("Couldn't find WOE ID for %s" % query)
            query = places[0]["woeid"]

        if not query:
            things = t.trends_available()
        else:
            trends = t.trends_place(query)
            # with no trends returned, `things` stays the empty default
            if trends:
                things = trends[0]["trends"]

    elif command == "replies":
        tweet = t.tweet(query)
        if not tweet:
            parser.error("tweet with id %s does not exist" % query)
        things = t.replies(tweet, args.recursive)

    elif command == "listmembers":
        list_parts = re.match(r"^https://twitter.com/(.+)/lists/(.+)$", query)
        if not list_parts:
            parser.error(
                "provide the url for the list, e.g., https://twitter.com/USAFacts/lists/us-armed-forces"
            )
        # group(1) is the owner, group(2) the list slug; groups(1) would
        # return the tuple of all groups instead of the owner name
        things = t.list_members(
            slug=list_parts.group(2), owner_screen_name=list_parts.group(1)
        )

    elif command == "configure":
        t.configure()
        sys.exit()

    else:
        usage_and_exit()

    # get the output filehandle
    if args.output:
        if pyv == 3:
            fh = codecs.open(args.output, "wb", "utf8")
        else:
            fh = open(args.output, "w")
    else:
        fh = sys.stdout

    # optionally create a csv writer
    csv_writer = None
    csv_formats = ("csv", "csv-excel")
    if args.format in csv_formats and command not in [
        "filter",
        "hydrate",
        "replies",
        "retweets",
        "sample",
        "search",
        "timeline",
        "tweet",
    ]:
        parser.error("csv output not available for %s" % command)
    elif args.format in csv_formats:
        csv_writer = csv.writer(fh)
        csv_writer.writerow(get_headings())

    line_count = 0
    file_count = 0
    for thing in things:
        # rotate the files if necessary
        if args.output and args.split and line_count % args.split == 0:
            file_count += 1
            # args.output is set here, so fh is a real file (never
            # stdout); close it before switching to the next one
            fh.close()
            fh = codecs.open(numbered_filepath(args.output, file_count), "wb", "utf8")
            if csv_writer:
                csv_writer = csv.writer(fh)
                csv_writer.writerow(get_headings())
        line_count += 1

        # ready to output
        if isinstance(thing, str_type):
            # user or tweet IDs
            print(thing, file=fh)
            log.info("archived %s" % thing)
        elif "id_str" in thing:
            # tweets and users
            if args.format == "json":
                print(json.dumps(thing), file=fh)
            elif args.format == "csv":
                csv_writer.writerow(get_row(thing))
            elif args.format == "csv-excel":
                csv_writer.writerow(get_row(thing, excel=True))
            log.info("archived %s", thing["id_str"])
        elif "woeid" in thing:
            # places
            print(json.dumps(thing), file=fh)
        elif "tweet_volume" in thing:
            # trends
            print(json.dumps(thing), file=fh)
        elif "limit" in thing:
            # rate limits; use a separate name so the Twarc client `t`
            # is not overwritten
            limit_time = datetime.datetime.utcfromtimestamp(
                float(thing["limit"]["timestamp_ms"]) / 1000
            )
            limit_time = limit_time.isoformat("T") + "Z"
            log.warning(
                "%s tweets undelivered at %s", thing["limit"]["track"], limit_time
            )
            if args.warnings:
                print(json.dumps(thing), file=fh)
        elif "warning" in thing:
            # other warnings
            log.warning(thing["warning"]["message"])
            if args.warnings:
                print(json.dumps(thing), file=fh)
        elif "data" in thing:
            # Labs style JSON schema.
            print(json.dumps(thing), file=fh)
help="limit filter stream to location(s)" ) parser.add_argument( "--follow", dest="follow", help="limit filter to tweets from given user id(s)" ) parser.add_argument( "--recursive", dest="recursive", action="store_true", help="also fetch replies to replies", ) parser.add_argument( "--tweet_mode", action="store", default="extended", dest="tweet_mode", choices=["compat", "extended"], help="set tweet mode", ) parser.add_argument( "--protected", dest="protected", action="store_true", help="include protected tweets", ) parser.add_argument( "--output", action="store", default=None, dest="output", help="write output to file path", ) parser.add_argument( "--format", action="store", default="json", dest="format", choices=["json", "csv", "csv-excel"], help="set output format", ) parser.add_argument( "--split", action="store", type=int, default=0, help="used with --output to split into numbered files", ) parser.add_argument( "--skip_key_validation", action="store_true", help="skip checking keys are valid on startup", ) parser.add_argument( "--app_auth", action="store_true", default=False, help="run in App Auth mode instead of User Auth", ) parser.add_argument( "--gnip_auth", action="store_true", default=False, help="run in Gnip Auth mode (for enterprise APIs)", ) parser.add_argument( "--30day", action="store", dest="thirtyday", help="environment to use to search 30day premium endpoint", ) parser.add_argument( "--fullarchive", action="store", help="environment to use to search fullarchive premium endpoint", ), parser.add_argument( "--gnip_fullarchive", action="store", help="environment to use to search gnip fullarchive enterprise endpoint", ), parser.add_argument( "--from_date", action="store", default=None, help="limit premium search to date e.g. 2012-05-01 03:04:01", ) parser.add_argument( "--to_date", action="store", default=None, help="limit premium search to date e.g. 
@with_plugins(entry_points(group="twarc.plugins"))
@click.group()
@click.option(
    "--consumer-key",
    type=str,
    envvar="CONSUMER_KEY",
    help='Twitter app consumer key (aka "App Key")',
)
@click.option(
    "--consumer-secret",
    type=str,
    envvar="CONSUMER_SECRET",
    help='Twitter app consumer secret (aka "App Secret")',
)
@click.option(
    "--access-token",
    type=str,
    envvar="ACCESS_TOKEN",
    help="Twitter app access token for user authentication.",
)
@click.option(
    "--access-token-secret",
    type=str,
    envvar="ACCESS_TOKEN_SECRET",
    help="Twitter app access token secret for user authentication.",
)
@click.option(
    "--bearer-token",
    type=str,
    envvar="BEARER_TOKEN",
    help="Twitter app access bearer token.",
)
@click.option(
    "--app-auth/--user-auth",
    default=True,
    help="Use application authentication or user authentication. Some rate limits are "
    "higher with user authentication, but not all endpoints are supported.",
    show_default=True,
)
@click.option("--log", "-l", "log_file", default="twarc.log")
@click.option("--verbose", is_flag=True, default=False)
@click.option(
    "--metadata/--no-metadata",
    default=True,
    show_default=True,
    help="Include/don't include metadata about when and how data was collected.",
)
@configuration_option(
    cmd_name="twarc", config_file_name="config", provider=config_provider
)
@click.pass_context
def twarc2(
    ctx,
    consumer_key,
    consumer_secret,
    access_token,
    access_token_secret,
    bearer_token,
    log_file,
    metadata,
    app_auth,
    verbose,
):
    """
    Collect data from the Twitter V2 API.
    """
    # NOTE: the docstring above is shown by click as the command's help
    # text, so it is kept short; implementation notes live in comments.
    #
    # Sets up logging, then stores a Twarc2 client on ctx.obj when enough
    # credentials are present. Without a bearer token or a consumer
    # key/secret pair the `configure` command is invoked instead.
    logging.basicConfig(
        filename=log_file,
        level=logging.DEBUG if verbose else logging.INFO,
        format="%(asctime)s %(levelname)s %(message)s",
    )

    log.info("using config %s", config_provider.file_path)

    have_app_credentials = bool(bearer_token or (consumer_key and consumer_secret))

    if have_app_credentials:
        if app_auth:
            ctx.obj = twarc.Twarc2(
                consumer_key=consumer_key,
                consumer_secret=consumer_secret,
                bearer_token=bearer_token,
                metadata=metadata,
            )

        # Check everything is present for user auth.
        elif consumer_key and consumer_secret and access_token and access_token_secret:
            ctx.obj = twarc.Twarc2(
                consumer_key=consumer_key,
                consumer_secret=consumer_secret,
                access_token=access_token,
                access_token_secret=access_token_secret,
                metadata=metadata,
            )

        else:
            # The message is one implicitly concatenated string. Commas
            # between the pieces would pass them to click.style as
            # positional fg/bg/bold arguments and raise a TypeError.
            click.echo(
                click.style(
                    "🙃 To use user authentication, you need all of the following:\n"
                    "- consumer_key\n"
                    "- consumer_secret\n"
                    "- access_token\n"
                    "- access_token_secret\n",
                    fg="red",
                ),
                err=True,
            )
            click.echo("You can configure twarc2 using the `twarc2 configure` command.")
    else:
        click.echo()
        click.echo("👋 Hi I don't see a configuration file yet, so let's make one.")
        click.echo()
        click.echo("Please follow these steps:")
        click.echo()
        click.echo("1. visit https://developer.twitter.com/en/portal/")
        click.echo("2. create a project and an app")
        click.echo("3. go to your Keys and Tokens and generate your keys")
        click.echo()
        ctx.invoke(configure)
def _search(
    T,
    query,
    outfile,
    since_id,
    until_id,
    start_time,
    end_time,
    limit,
    max_results,
    archive,
    hide_progress,
    expansions,
    tweet_fields,
    user_fields,
    media_fields,
    poll_fields,
    place_fields,
    sort_order,
):
    """
    Common function to Search for tweets.

    Runs ``T.search_all`` (when ``archive`` is set) or ``T.search_recent``
    with the given query, time/id bounds and field selections, writes every
    page of results to ``outfile`` and stops once ``limit`` tweets have been
    written (``limit == 0`` means no limit).
    """
    count = 0

    # Make sure times are always in UTC, click sometimes doesn't add timezone:
    if start_time is not None and start_time.tzinfo is None:
        start_time = start_time.replace(tzinfo=timezone.utc)
    if end_time is not None and end_time.tzinfo is None:
        end_time = end_time.replace(tzinfo=timezone.utc)

    if archive:
        search_method = T.search_all

        # start time defaults to the beginning of Twitter to override the
        # default of the last month. Only do this if start_time is not already
        # specified and since_id and until_id aren't being used
        if start_time is None and since_id is None and until_id is None:
            start_time = datetime.datetime(2006, 3, 21, tzinfo=datetime.timezone.utc)
    else:
        search_method = T.search_recent

    # click opens the "-" default as the standard output stream, whose name is
    # "<stdout>"; the progress bar is hidden then so it does not mix with the
    # data being written.
    hide_progress = True if (outfile.name == "<stdout>") else hide_progress

    with TimestampProgressBar(
        since_id, until_id, start_time, end_time, disable=hide_progress
    ) as progress:
        for result in search_method(
            query=query,
            since_id=since_id,
            until_id=until_id,
            start_time=start_time,
            end_time=end_time,
            max_results=max_results,
            expansions=expansions,
            tweet_fields=tweet_fields,
            user_fields=user_fields,
            media_fields=media_fields,
            poll_fields=poll_fields,
            place_fields=place_fields,
            sort_order=sort_order,
        ):
            _write(result, outfile)

            # A page may come back without "data"; read it once, tolerantly,
            # and use the same list for both logging and counting.
            tweets = result.get("data", [])
            log.info("archived %s", ",".join(t["id"] for t in tweets))
            progress.update_with_result(result)
            count += len(tweets)

            if limit != 0 and count >= limit:
                # Display message when stopped early
                progress.desc = f"Set --limit of {limit} reached"
                break
        else:
            # Only reached when the results were exhausted without hitting
            # --limit.
            progress.early_stop = False
def command_line_input_output_file_arguments(f):
    """
    Decorator that adds the positional ``infile`` and ``outfile`` arguments
    to a command. Both default to "-".
    """
    with_outfile = click.argument("outfile", type=click.File("w"), default="-")
    with_infile = click.argument("infile", type=click.File("r"), default="-")
    # outfile is attached first and infile second, as decorators stack
    # bottom-up.
    return with_infile(with_outfile(f))
def command_line_timelines_options(f):
    """
    Decorator that adds the flags shared by the timeline commands:
    --exclude-replies, --exclude-retweets and --use-search.
    """
    flags = (
        (
            "--exclude-replies",
            "Exclude replies from timeline",
        ),
        (
            "--exclude-retweets",
            "Exclude retweets from timeline",
        ),
        (
            "--use-search",
            "Use the search/all API endpoint which is not limited to the last 3200 tweets, but requires Academic Product Track access.",
        ),
    )
    # Applied in the listed order; every flag is a boolean that is off by
    # default.
    for flag_name, flag_help in flags:
        f = click.option(flag_name, is_flag=True, default=False, help=flag_help)(f)
    return f
def _validate_expansions(context, parameter, value):
    """
    Click callback that checks a comma separated option value against the
    entries of the option's own default.

    Returns the value re-joined with commas, or None when no value was given.
    Raises click.BadOptionUsage for the first entry that is not part of
    ``parameter.default``.
    """
    if not value:
        return None

    requested = value.split(",")
    allowed = parameter.default.split(",")

    # `is not None` matters: an empty entry ("a,,b") is itself an invalid item.
    unknown = next((entry for entry in requested if entry not in allowed), None)
    if unknown is not None:
        v = unknown
        raise click.BadOptionUsage(
            parameter.name,
            f'"{v}" is not a valid entry for --{parameter.name}. Must be a comma separated string, without spaces, like this:\n--{parameter.name} "{parameter.default}"',
        )

    return ",".join(requested)
Default is all available.", callback=_validate_expansions, )(f) f = click.option( "--tweet-fields", default=",".join(TWEET_FIELDS), type=click.STRING, is_eager=True, help="Comma separated list of tweet fields to retrieve. Default is all available.", callback=_validate_expansions, )(f) f = click.option( "--expansions", default=",".join(EXPANSIONS), type=click.STRING, help="Comma separated list of expansions to retrieve. Default is all available.", callback=_validate_expansions, )(f) return f def command_line_expansions_shortcuts(f): """ Decorator for specifying common fields and expansions presets """ f = click.option( "--minimal-fields", cls=MutuallyExclusiveOption, mutually_exclusive=[ "no_context_annotations", "expansions", "tweet_fields", "user_fields", "media_fields", "poll_fields", "place_fields", "counts_only", ], is_flag=True, default=False, is_eager=True, help="By default twarc gets all available data. This option requests the minimal retrievable amount of data - only IDs and object references are retrieved. Setting this makes --max-results 500 the default.", )(f) f = click.option( "--no-context-annotations", cls=MutuallyExclusiveOption, mutually_exclusive=[ "minimal_fields", "expansions", "tweet_fields", "user_fields", "media_fields", "poll_fields", "place_fields", "counts_only", ], is_flag=True, default=False, is_eager=True, help="By default twarc gets all available data. This leaves out context annotations (Twitter API limits --max-results to 100 if these are requested). 
def _process_expansions_shortcuts(kwargs):
    """
    Translate the --minimal-fields / --no-context-annotations shortcut flags
    into explicit field and expansion settings.

    Both flags are removed from ``kwargs``; when set they overwrite the
    corresponding ``*_fields`` / ``expansions`` entries. The same (mutated)
    dict is returned.
    """
    minimal = kwargs.pop("minimal_fields", None)
    if minimal:
        kwargs.update(
            {
                "expansions": "author_id,in_reply_to_user_id,referenced_tweets.id,referenced_tweets.id.author_id,attachments.poll_ids,attachments.media_keys,geo.place_id",
                "tweet_fields": "id,conversation_id,author_id,in_reply_to_user_id,referenced_tweets,geo",
                # pinned_tweet_id is the only extra one, id,username,name are
                # always returned.
                "user_fields": "id,username,name,pinned_tweet_id",
                "media_fields": "media_key",
                "poll_fields": "id",
                "place_fields": "id",
            }
        )

    without_annotations = kwargs.pop("no_context_annotations", None)
    if without_annotations:
        # Every tweet field listed here; context_annotations is not among them.
        kwargs.update(
            tweet_fields="attachments,author_id,conversation_id,created_at,entities,geo,id,in_reply_to_user_id,lang,public_metrics,text,possibly_sensitive,referenced_tweets,reply_settings,source,withheld"
        )

    return kwargs
For help on how to write a query see https://developer.twitter.com/en/docs/twitter-api/tweets/search/integrate/build-a-query """ kwargs = _process_expansions_shortcuts(kwargs) return _search( T, query, outfile, **kwargs, ) @twarc2.command("counts") @command_line_search_options @click.option( "--archive", is_flag=True, default=False, help="Count using the full archive (requires Academic Research track)", ) @click.option( "--granularity", default="hour", type=click.Choice(["day", "hour", "minute"], case_sensitive=False), help="Aggregation level for counts. Can be one of: day, hour, minute. Default is hour.", ) @click.option( "--limit", default=0, help="Maximum number of days of results to save (minimum is 30 days)", ) @click.option( "--text", is_flag=True, default=False, help="Output the counts as human readable text", ) @click.option("--csv", is_flag=True, default=False, help="Output counts as CSV") @command_line_progressbar_option @click.argument("query", type=str) @click.argument("outfile", type=click.File("w"), default="-") @click.pass_obj @cli_api_error def counts( T, query, outfile, since_id, until_id, start_time, end_time, archive, granularity, limit, text, csv, hide_progress, ): """ Return counts of tweets matching a query. """ count = 0 # Make sure times are always in UTC, click sometimes doesn't add timezone: if start_time is not None and start_time.tzinfo is None: start_time = start_time.replace(tzinfo=timezone.utc) if end_time is not None and end_time.tzinfo is None: end_time = end_time.replace(tzinfo=timezone.utc) if archive: count_method = T.counts_all # start time defaults to the beginning of Twitter to override the # default of the last month. 
@twarc2.command("tweet")
@command_line_expansions_shortcuts
@command_line_expansions_options
@click.option("--pretty", is_flag=True, default=False, help="Pretty print the JSON")
@click.argument("tweet_id", type=str)
@click.argument("outfile", type=click.File("w"), default="-")
@click.pass_obj
@cli_api_error
def tweet(T, tweet_id, outfile, pretty, **kwargs):
    """
    Look up a tweet using its tweet id or URL.
    """
    kwargs = _process_expansions_shortcuts(kwargs)

    if "https" in tweet_id:
        # Prefer the digits that follow /status/ so a trailing slash or a
        # query string (e.g. "?s=20") does not end up in the id; otherwise
        # fall back to the last path segment.
        status = re.search(r"/status(?:es)?/(\d+)", tweet_id)
        tweet_id = status.group(1) if status else tweet_id.split("/")[-1]

    if not re.match(r"^\d+$", tweet_id):
        click.echo(click.style("Please enter a tweet URL or ID", fg="red"), err=True)
        # Nothing sensible can be looked up, so stop instead of sending the
        # malformed id to the API.
        return

    result = next(T.tweet_lookup([tweet_id], **kwargs))
    _write(result, outfile, pretty=pretty)
@twarc2.command("liking-users")
@click.option(
    "--limit",
    default=0,
    help="Maximum number of liking users to retrieve. Increments of 100 or --max-results if set.",
    type=int,
)
@click.option(
    "--max-results",
    default=100,
    help="Maximum number of users (likes) per page. Default and maximum is 100.",
    type=int,
)
@command_line_progressbar_option
@click.argument("tweet_id", type=str)
@click.argument("outfile", type=click.File("w"), default="-")
@click.pass_obj
@cli_api_error
def liking_users(T, tweet_id, outfile, limit, max_results, hide_progress):
    """
    Get the users that liked a specific tweet.

    Note that the progress bar is approximate.
    """
    # Progress total used when the like count is not looked up below.
    lookup_total = 1

    if not re.match(r"^\d+$", str(tweet_id)):
        click.echo(click.style("Please enter a tweet ID", fg="red"), err=True)
        # Stop here instead of querying the API with a malformed id.
        return

    # click opens the "-" default as the standard output stream, whose name is
    # "<stdout>"; hide the progress bar then so it does not mix with the data.
    hide_progress = True if (outfile.name == "<stdout>") else hide_progress

    if not hide_progress:
        # TODO: we could probably do this everytime, and avoid doing any lookups
        # for tweets that don't exist anymore.
        target_tweet = list(T.tweet_lookup([tweet_id]))[0]
        if "data" in target_tweet:
            lookup_total = target_tweet["data"][0]["public_metrics"]["like_count"]

    _write_with_progress(
        func=T.liking_users,
        tweet_id=tweet_id,
        outfile=outfile,
        limit=limit,
        hide_progress=hide_progress,
        progress_total=lookup_total,
        max_results=max_results,
    )
@twarc2.command("quotes")
@click.option(
    "--limit",
    default=0,
    help="Maximum number of quote tweets to retrieve. Increments of 100 or --max-results if set.",
    type=int,
)
@click.option(
    "--max-results",
    default=100,
    help="Maximum number of quote tweets per page of results. Default and maximum is 100.",
    type=int,
)
@command_line_expansions_shortcuts
@command_line_expansions_options
@command_line_progressbar_option
@click.argument("tweet_id", type=str)
@click.argument("outfile", type=click.File("w"), default="-")
@click.pass_obj
@cli_api_error
def quotes(T, tweet_id, outfile, limit, max_results, hide_progress, **kwargs):
    """
    Get the tweets that quote tweet the given tweet.

    Note that the progress bar is approximate.
    """
    lookup_total = 0
    kwargs = _process_expansions_shortcuts(kwargs)
    # Also remove media poll and place from kwargs, these are not valid for this endpoint:
    kwargs.pop("media_fields", None)
    kwargs.pop("poll_fields", None)
    kwargs.pop("place_fields", None)

    if not re.match(r"^\d+$", str(tweet_id)):
        click.echo(click.style("Please enter a tweet ID", fg="red"), err=True)
        # Stop here instead of querying the API with a malformed id.
        return

    # click opens the "-" default as the standard output stream, whose name is
    # "<stdout>"; hide the progress bar then so it does not mix with the data.
    hide_progress = True if (outfile.name == "<stdout>") else hide_progress

    if not hide_progress:
        # Use the tweet's quote count as the (approximate) progress total.
        target_tweet = list(T.tweet_lookup([tweet_id]))[0]
        if "data" in target_tweet:
            lookup_total = target_tweet["data"][0]["public_metrics"]["quote_count"]

    _write_with_progress(
        func=T.quotes,
        tweet_id=tweet_id,
        outfile=outfile,
        limit=limit,
        hide_progress=hide_progress,
        progress_total=lookup_total,
        max_results=max_results,
        **kwargs,
    )
@twarc2.command("dehydrate")
@click.option(
    "--id-type",
    default="tweets",
    type=click.Choice(["tweets", "users"], case_sensitive=False),
    help="IDs to extract - either 'tweets' or 'users'.",
)
@command_line_progressbar_option
@command_line_input_output_file_arguments
@cli_api_error
def dehydrate(infile, outfile, id_type, hide_progress):
    """
    Extract tweet or user IDs from a dataset.
    """
    if infile.name == outfile.name:
        click.echo(
            click.style(
                "💔 Cannot extract files in-place, specify a different output file!",
                fg="red",
            ),
            err=True,
        )
        return

    with FileSizeProgressBar(infile, outfile, disable=hide_progress) as progress:
        count = 0
        unique_ids = set()
        for line in infile:
            count += 1
            progress.update(len(line))

            # ignore empty lines
            line = line.strip()
            if not line:
                continue

            try:
                for tweet in ensure_flattened(json.loads(line)):
                    if id_type == "tweets":
                        click.echo(tweet["id"], file=outfile)
                        unique_ids.add(tweet["id"])
                    elif id_type == "users":
                        click.echo(tweet["author_id"], file=outfile)
                        unique_ids.add(tweet["author_id"])
            except KeyError:
                click.echo(
                    f"No {id_type} ID found in JSON data on line {count}", err=True
                )
                break
            # JSONDecodeError is a subclass of ValueError, so it has to be
            # handled first; otherwise the ValueError clause swallows it and
            # the "Invalid JSON" message can never be shown.
            except json.decoder.JSONDecodeError:
                click.echo(f"Invalid JSON on line {count}", err=True)
                break
            except ValueError:
                click.echo(f"Unexpected JSON data on line {count}", err=True)
                break

    # `count` is the number of lines read, including blank ones and the line
    # that stopped processing, if any.
    click.echo(
        f"ℹ️ Parsed {len(unique_ids)} {id_type} IDs from {count} lines in {infile.name} file.",
        err=True,
    )
@twarc2.command("mentions")
@command_line_search_options
@command_line_expansions_shortcuts
@command_line_expansions_options
@command_line_progressbar_option
@click.argument("user_id", type=str)
@click.argument("outfile", type=click.File("w"), default="-")
@click.pass_obj
@cli_api_error
def mentions(T, user_id, outfile, hide_progress, **kwargs):
    """
    Retrieve max of 800 of the most recent tweets mentioning the given user.
    """
    kwargs = _process_expansions_shortcuts(kwargs)

    with tqdm(disable=hide_progress, total=800) as progress:
        for page in T.mentions(user_id, **kwargs):
            _write(page, outfile)
            page_tweets = page.get("data", [])
            progress.update(len(page_tweets))

        # The loop never breaks, so this summary runs whenever every page was
        # consumed without an exception.
        if progress.n > 800:
            progress.desc = f"API limit reached with {progress.n} tweets"
            # Clamp the counter to the bar's total.
            progress.n = 800
        else:
            progress.desc = f"Set limit reached with {progress.n} tweets"
""" kwargs = _process_expansions_shortcuts(kwargs) count = 0 user = T._ensure_user(user_id) # It's possible to skip this to optimize more if use_search or (start_time or end_time) or (since_id or until_id): pbar = TimestampProgressBar # Infer start time as the user created time if not using ids if start_time is None and (since_id is None and until_id is None): start_time = datetime.datetime.strptime( user["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ" ) # Infer since_id as user created time if using ids if start_time is None and since_id is None: infer_id = _millis2snowflake( _date2millis( datetime.datetime.strptime( user["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ" ) ) ) # Snowflake epoch is 1288834974657 so if older, just set it to "1" since_id = infer_id if infer_id > 0 else 1 pbar_params = { "since_id": since_id, "until_id": until_id, "start_time": start_time, "end_time": end_time, "disable": hide_progress, } else: pbar = tqdm pbar_params = { "disable": hide_progress, "total": user["public_metrics"]["tweet_count"], } tweets = _timeline_tweets( T, use_search=use_search, user_id=user_id, since_id=since_id, until_id=until_id, start_time=start_time, end_time=end_time, exclude_retweets=exclude_retweets, exclude_replies=exclude_replies, sort_order=sort_order, **kwargs, ) with pbar(**pbar_params) as progress: for result in tweets: _write(result, outfile) count += len(result["data"]) if isinstance(progress, TimestampProgressBar): progress.update_with_result(result) else: progress.update(len(result["data"])) if limit != 0 and count >= limit: # Display message when stopped early progress.desc = f"Set --limit of {limit} reached" break else: if isinstance(progress, TimestampProgressBar): progress.early_stop = False if not use_search and user["public_metrics"]["tweet_count"] > 3200: progress.desc = f"API limit of 3200 reached" @twarc2.command("timelines") @click.option("--limit", default=0, help="Maximum number of tweets to return") @click.option( "--timeline-limit", default=0, 
help="Maximum number of tweets to return per-timeline", ) @click.option( "--sort-order", type=click.Choice(["recency", "relevancy"]), help='Filter tweets based on their date ("recency") (default) or based on their relevance as indicated by Twitter ("relevancy")', ) @command_line_search_options @command_line_timelines_options @command_line_expansions_shortcuts @command_line_expansions_options @command_line_progressbar_option @command_line_input_output_file_arguments @click.pass_obj def timelines( T, infile, outfile, limit, timeline_limit, use_search, sort_order, hide_progress, **kwargs, ): """ Fetch the timelines of every user in an input source of tweets. If the input is a line oriented text file of user ids or usernames that will be used instead. The infile can be: - A file containing one user id per line (either quoted or unquoted) - A JSONL file containing tweets collected in the Twitter API V2 format """ total_count = 0 line_count = 0 seen = set() kwargs = _process_expansions_shortcuts(kwargs) with FileLineProgressBar(infile, outfile, disable=hide_progress) as progress: for line in infile: progress.update() line_count += 1 line = line.strip() if line == "": log.warn("skipping blank line on line %s", line_count) continue users = None try: # assume this the line contains some tweet json data = json.loads(line) # if it parsed as a string or int assume it's a username if isinstance(data, str) or isinstance(data, int): users = set([line]) # otherwise try to flatten the data and get the user ids else: try: users = set([t["author"]["id"] for t in ensure_flattened(data)]) except (KeyError, ValueError): log.warn( "ignored line %s which didn't contain users", line_count ) continue except json.JSONDecodeError: # maybe it's a single user? 
@twarc2.command("searches")
@command_line_search_options
@command_line_search_archive_options
@click.option(
    "--sort-order",
    type=click.Choice(["recency", "relevancy"]),
    help='Filter tweets based on their date ("recency") (default) or based on their relevance as indicated by Twitter ("relevancy")',
)
@click.option(
    "--counts-only",
    is_flag=True,
    default=False,
    help="Only retrieve counts of tweets matching the search, not the tweets themselves. "
    "outfile will be a CSV containing the counts for all of the queries in the input file.",
)
@click.option(
    "--combine-queries",
    is_flag=True,
    default=False,
    help="""Merge consecutive queries into a single OR query.
    For example, if the three rows in your file are: banana, apple, pear
    then a single query ((banana) OR (apple) OR (pear)) will be issued.
    """,
)
@click.option(
    "--granularity",
    default="day",
    type=click.Choice(["day", "hour", "minute"], case_sensitive=False),
    help="Aggregation level for counts (only used when --counts-only is used). Can be one of: day, hour, minute. Default is day.",
)
@command_line_expansions_shortcuts
@command_line_expansions_options
@command_line_progressbar_option
@command_line_input_output_file_arguments
@click.pass_obj
def searches(
    T,
    infile,
    outfile,
    limit,
    max_results,
    since_id,
    until_id,
    start_time,
    end_time,
    archive,
    counts_only,
    granularity,
    combine_queries,
    hide_progress,
    sort_order,
    **kwargs,
):
    """
    Execute each search in the input file, one at a time.

    The infile must be a file containing one query per line. Each line will be
    passed through directly to the Twitter API - unlike the timelines command
    quotes will not be removed.

    Input queries will be deduplicated - if the same literal query is present
    in the file, it will still only be run once.

    It is recommended that this command first be run with --counts-only, to
    check that each of the queries is retrieving the volume of tweets
    expected, and to avoid consuming quota unnecessarily.
    """
    # NOTE: the docstring above is also the command's --help text, so the
    # implementation notes are kept in comments.
    line_count = 0
    seen = set()

    kwargs = _process_expansions_shortcuts(kwargs)

    # Make sure times are always in UTC, click sometimes doesn't add timezone:
    if start_time is not None and start_time.tzinfo is None:
        start_time = start_time.replace(tzinfo=datetime.timezone.utc)
    if end_time is not None and end_time.tzinfo is None:
        end_time = end_time.replace(tzinfo=datetime.timezone.utc)

    # Standard API max query length
    max_query_length = 512

    # TODO: this duplicates existing logic in _search, but _search is too
    # specific to be reused here.
    if archive:
        # start time defaults to the beginning of Twitter to override the
        # default of the last month. Only do this if start_time is not already
        # specified and since_id and until_id aren't being used
        if start_time is None and since_id is None and until_id is None:
            start_time = datetime.datetime(2006, 3, 21, tzinfo=datetime.timezone.utc)

        # Academic track lets you use longer queries
        max_query_length = 1024

    window = {
        "since_id": since_id,
        "until_id": until_id,
        "start_time": start_time,
        "end_time": end_time,
    }

    if counts_only:
        api_method = T.counts_all if archive else T.counts_recent

        # None of these parameters are passed on to the counts endpoints.
        for tweet_only_option in (
            "expansions",
            "tweet_fields",
            "user_fields",
            "media_fields",
            "poll_fields",
            "place_fields",
            "sort_order",
        ):
            kwargs.pop(tweet_only_option, None)

        kwargs = {**kwargs, **window, "granularity": granularity}

        # Write the header for the CSV output
        click.echo(f"query,start,end,{granularity}_count", file=outfile)
    else:
        api_method = T.search_all if archive else T.search_recent
        kwargs = {
            **kwargs,
            **window,
            "max_results": max_results,
            "sort_order": sort_order,
        }

    def run_query(issue_query):
        # Issue one query and write its results. The tweet counter is local,
        # so --limit always applies to each issued query on its own.
        log.info(f'Beginning search for "{issue_query}"')
        retrieved = 0

        for result in api_method(issue_query, **kwargs):
            if counts_only:
                # NOTE(review): the query is written unescaped, so a query
                # containing commas or quotes produces a malformed CSV row.
                for r in result["data"]:
                    click.echo(
                        f'{issue_query},{r["start"]},{r["end"]},{r["tweet_count"]}',
                        file=outfile,
                    )
            else:
                # Apply the limit if not counting
                _write(result, outfile)
                retrieved += len(result["data"])
                if limit and retrieved >= limit:
                    break

    # TODO: Validate the queries are all valid length before beginning and report errors
    # TODO: Needs an inputlines progress bar instead, as the queries are variable
    # size.
    with FileLineProgressBar(infile, outfile, disable=hide_progress) as progress:
        # Accumulated "(a) OR (b) ..." query that has not been issued yet.
        merged_query = ""

        for query in infile:
            query = query.strip()

            progress.update(1)
            line_count += 1

            if query == "":
                log.warning("skipping blank line on line %s", line_count)
                continue

            if len(query) >= max_query_length:
                log.warning(f"skipping too long query {query} on line {line_count}")
                continue

            if query in seen:
                log.info("already processed %s, skipping", query)
                continue

            seen.add(query)

            if not combine_queries:
                # This is the normal case - we are not doing any combination.
                run_query(query)
                continue

            if not merged_query:
                merged_query = f"({query})"
                continue

            extended_query = f"{merged_query} OR ({query})"
            if len(extended_query) >= max_query_length:
                # Adding this query would exceed the limit, so issue what has
                # been merged so far and start a new batch with this query.
                run_query(merged_query)
                merged_query = f"({query})"
            else:
                # We haven't exceeded the length yet, so accept the addon
                merged_query = extended_query

        # Whatever is still merged has never been issued. Checking the string
        # itself (rather than comparing against the last line read) keeps the
        # final batch from being dropped when the file ends with skipped lines.
        if combine_queries and merged_query:
            run_query(merged_query)
@twarc2.command("conversations")
@click.option(
    "--conversation-limit",
    default=0,
    help="Maximum number of tweets to return per-conversation",
)
@click.option(
    "--sort-order",
    type=click.Choice(["recency", "relevancy"]),
    help='Filter tweets based on their date ("recency") (default) or based on their relevance as indicated by Twitter ("relevancy")',
)
@command_line_search_options
@command_line_search_archive_options
@command_line_expansions_shortcuts
@command_line_expansions_options
@command_line_progressbar_option
@command_line_input_output_file_arguments
@click.pass_obj
@cli_api_error
def conversations(
    T, infile, outfile, archive, limit, conversation_limit, hide_progress, **kwargs
):
    """
    Fetch the full conversation threads that the input tweets are a part of.
    Alternatively the input can be a line oriented file of conversation ids.
    """
    # NOTE: the docstring above is also the command's --help text, so the
    # implementation notes are kept in comments.
    kwargs = _process_expansions_shortcuts(kwargs)

    # keep track of conversation ids that have been fetched so that they
    # aren't fetched twice
    seen = set()

    # use the archive or recent search?
    search = T.search_all if archive else T.search_recent

    # tweets written so far across all conversations
    count = 0

    # set once the total tweet limit has been met
    stop = False

    with FileLineProgressBar(infile, outfile, disable=hide_progress) as progress:
        for line_number, line in enumerate(infile, start=1):
            progress.update()

            line = line.strip()
            if not line:
                continue

            if re.match(r"^\d+$", line):
                # the line is a bare conversation id
                conv_ids = [line]
            else:
                # otherwise collect every conversation id referenced by the
                # tweets on this line; a malformed line is skipped rather
                # than aborting the whole run
                try:
                    conv_ids = [
                        tweet.get("conversation_id")
                        for tweet in ensure_flattened(json.loads(line))
                    ]
                except ValueError:
                    # json.JSONDecodeError is a subclass of ValueError
                    log.warning(
                        "skipping line %s: not a conversation id or tweet JSON",
                        line_number,
                    )
                    continue

            # output results while paying attention to the set limits
            for conv_id in conv_ids:
                if conv_id is None:
                    # the tweet carried no conversation_id to search for
                    continue

                if conv_id in seen:
                    log.info(f"already fetched conversation_id {conv_id}")
                    continue
                seen.add(conv_id)

                conv_count = 0

                log.info(f"fetching conversation {conv_id}")
                for result in search(f"conversation_id:{conv_id}", **kwargs):
                    _write(result, outfile, False)

                    count += len(result["data"])
                    if limit and count >= limit:
                        log.info(f"reached tweet limit of {limit}")
                        stop = True
                        break

                    conv_count += len(result["data"])
                    if conversation_limit and conv_count >= conversation_limit:
                        log.info(f"reached conversation limit {conversation_limit}")
                        break

                if stop:
                    # don't start on the remaining conversations of this line
                    break

            if stop:
                break
@twarc2.command("places")
@click.option(
    "--type",
    "search_type",
    type=click.Choice(["name", "geo", "ip"]),
    default="name",
    help="How to search for places (defaults to name)",
)
@click.option(
    "--granularity",
    type=click.Choice(["neighborhood", "city", "admin", "country"]),
    default="neighborhood",
    help="What type of places to search for (defaults to neighborhood)",
)
@click.option("--max-results", type=int, help="Maximum results to return")
@click.option("--json", is_flag=True, help="Output raw JSON response")
@click.argument("value")
@click.argument("outfile", type=click.File("w"), default="-")
@click.pass_obj
@cli_api_error
def places(T, value, outfile, search_type, granularity, max_results, json):
    """
    Search for places by place name, geo coordinates or ip address.
    """
    # NOTE: the docstring above is also the command's --help text, so the
    # implementation notes are kept in comments. The `json` parameter is the
    # --json flag and shadows the json module inside this function.
    params = {"granularity": granularity}

    if search_type == "name":
        params["query"] = value
    elif search_type == "ip":
        params["ip"] = value
    elif search_type == "geo":
        try:
            # ValueError covers both non-numeric parts and a wrong number of
            # comma separated parts.
            lat, lon = map(float, value.split(","))
        except ValueError:
            click.echo("--geo must be lat,lon", err=True)
            # Without coordinates there is nothing meaningful to search for.
            return
        # Add to params (instead of replacing them) so that the requested
        # granularity is sent for geo searches as well.
        params["lat"] = lat
        params["lon"] = lon

    if max_results:
        params["max_results"] = max_results

    result = T.geo(**params)

    if "errors" in result:
        click.echo(_error_str(result["errors"]), err=True)
    elif json:
        _write(result, outfile)
    else:
        for place in result["result"]["places"]:
            if granularity == "country":
                line = "{0} [id={1}]".format(place["country"], place["id"])
            else:
                line = "{0}, {1} [id={2}]".format(
                    place["full_name"], place["country"], place["id"]
                )
            # Write to the requested output file; the default "-" is stdout.
            click.echo(line, file=outfile)
@lists.command("lookup")
@click.argument("list_id", type=str)
@click.argument("outfile", type=click.File("w"), default="-")
@click.option("--pretty", is_flag=True, default=False, help="Pretty print the JSON")
@click.option(
    "--list-fields",
    default=",".join(LIST_FIELDS),
    type=click.STRING,
    is_eager=True,
    help="Comma separated list of fields about a list to retrieve. Default is all available.",
    callback=_validate_expansions,
)
@click.pass_obj
@cli_api_error
def lists_lookup(T, list_id, outfile, pretty, **kwargs):
    """
    Look up a single list using its list id or URL.
    """
    # NOTE: the docstring above is also the command's --help text, so the
    # implementation notes are kept in comments.
    kwargs = _process_expansions_shortcuts(kwargs)

    list_id = list_id.strip()
    if "https" in list_id:
        # Keep only the last path segment of the URL, ignoring any query
        # string and trailing slash.
        list_id = list_id.split("?")[0].rstrip("/").split("/")[-1]

    if not re.fullmatch(r"\d+", list_id):
        click.echo(click.style("Please enter a List URL or ID", fg="red"), err=True)
        # Don't send a request that is known to be invalid.
        return

    result = T.list_lookup(list_id, **kwargs)
    _write(result, outfile, pretty=pretty)
@lists.command("all")
@click.argument("user", type=str)
@click.argument("outfile", type=click.File("w"), default="-")
@click.option(
    "--list-fields",
    default=",".join(LIST_FIELDS),
    type=click.STRING,
    is_eager=True,
    help="Comma separated list of fields about a list to retrieve. Default is all available.",
    callback=_validate_expansions,
)
@click.option(
    "--limit",
    default=0,
    help="Maximum number of lists to save. Default is all.",
    type=int,
)
@command_line_progressbar_option
@click.pass_obj
@cli_api_error
def lists_all(T, user, outfile, limit, hide_progress, **kwargs):
    """
    Get all Lists that a user created or is subscribed to.

    You can use the `owned` or `followed` command to get just the lists
    created by the user, or just the lists followed by the user
    respectively.
    """
    # NOTE: the docstring above is also the command's --help text, so the
    # implementation notes are kept in comments.
    kwargs = _process_expansions_shortcuts(kwargs)

    # Owned lists are written first, then followed lists, into the same file.
    # NOTE(review): --limit is handed to each call separately, so up to
    # 2 * limit lists may be written in total - confirm this is intended.
    for method_name in ("owned_lists", "followed_lists"):
        _write_with_progress(
            func=getattr(T, method_name),
            user=user,
            outfile=outfile,
            limit=limit,
            hide_progress=hide_progress,
            progress_total=1,
            **kwargs,
        )
@lists.command("followed")
@click.argument("user", type=str)
@click.argument("outfile", type=click.File("w"), default="-")
@click.option(
    "--list-fields",
    default=",".join(LIST_FIELDS),
    type=click.STRING,
    is_eager=True,
    help="Comma separated list of fields about a list to retrieve. Default is all available.",
    callback=_validate_expansions,
)
@click.option(
    "--limit",
    default=0,
    help="Maximum number of lists to save. Default is all.",
    type=int,
)
@command_line_progressbar_option
@click.pass_obj
@cli_api_error
def lists_followed(T, user, outfile, limit, hide_progress, **kwargs):
    """
    Get all Lists that a user is following.
    """
    # NOTE: the docstring above is also the command's --help text, so the
    # implementation notes are kept in comments.
    kwargs = _process_expansions_shortcuts(kwargs)

    # progress_total=1 is the fixed value the original passes here; this
    # command never looks up how many lists the user follows.
    _write_with_progress(
        func=T.followed_lists,
        user=user,
        outfile=outfile,
        limit=limit,
        hide_progress=hide_progress,
        progress_total=1,
        **kwargs,
    )
@lists.command("followers")
@click.argument("list-id", type=str)
@click.argument("outfile", type=click.File("w"), default="-")
@click.option(
    "--limit",
    default=0,
    help="Maximum number of users to save. Default is all.",
    type=int,
)
@command_line_expansions_options
@command_line_progressbar_option
@click.pass_obj
@cli_api_error
def lists_followers(T, list_id, outfile, limit, hide_progress, **kwargs):
    """
    Get all Users that are following (subscribed) to a list.
    """
    # NOTE: the docstring above is also the command's --help text, so the
    # implementation notes are kept in comments.
    kwargs = _process_expansions_shortcuts(kwargs)

    # Also remove media, poll and place fields from kwargs, these are not
    # valid for this endpoint:
    for unsupported in ("media_fields", "poll_fields", "place_fields"):
        kwargs.pop(unsupported, None)

    # Look the list up first: it supplies the list id that is passed on and
    # the follower count used as the progress bar total.
    _list = ensure_flattened(T.list_lookup(list_id))[-1]
    list_id = _list["id"]
    lookup_total = int(_list["follower_count"])

    _write_with_progress(
        func=T.list_followers,
        list_id=list_id,
        outfile=outfile,
        limit=limit,
        hide_progress=hide_progress,
        progress_total=lookup_total,
        **kwargs,
    )
@lists.command("tweets")
@click.argument("list-id", type=str)
@click.argument("outfile", type=click.File("w"), default="-")
@click.option(
    "--limit",
    default=0,
    help="Maximum number of tweets to save. Default and max is last 800.",
    type=int,
)
@command_line_expansions_options
@command_line_progressbar_option
@click.pass_obj
@cli_api_error
def lists_tweets(T, list_id, outfile, limit, hide_progress, **kwargs):
    """
    Get up to the most recent 800 tweets posted by members of a list.
    """
    # NOTE: the docstring above is also the command's --help text, so the
    # implementation notes are kept in comments.
    options = _process_expansions_shortcuts(kwargs)

    # Also remove media, poll and place fields, these are not valid for this
    # endpoint:
    for unsupported in ("media_fields", "poll_fields", "place_fields"):
        options.pop(unsupported, None)

    # 800 is the maximum number of tweets available, so it is the bar total.
    _write_with_progress(
        func=T.list_tweets,
        list_id=list_id,
        outfile=outfile,
        limit=limit,
        hide_progress=hide_progress,
        progress_total=800,
        **options,
    )
@stream_rules.command("delete")
@click.argument("value")
@click.pass_obj
@cli_api_error
def delete_stream_rule(T, value):
    """
    Delete the stream rule that matches a given value.
    """
    # NOTE: the docstring above is also the command's --help text, so the
    # implementation notes are kept in comments.
    result = T.get_stream_rules()
    if "data" not in result:
        click.echo(click.style("💔 There are no rules to delete!", fg="red"), err=True)
        return

    # find the id of the first rule whose value matches exactly
    rule_id = next(
        (rule["id"] for rule in result["data"] if rule["value"] == value), None
    )
    if not rule_id:
        click.echo(
            click.style(f'🙃 No rule could be found for "{value}"', fg="red"),
            err=True,
        )
        return

    results = T.delete_stream_rule_ids([rule_id])
    if "errors" in results:
        click.echo(_error_str(results["errors"]), err=True)
    else:
        # click.echo's `color` argument is a flag that forces ANSI output on or
        # off, not a colour name; the colour has to be applied with click.style.
        click.echo(click.style(f"🗑 Deleted stream rule for {value}", fg="green"))
@twarc2.group()
@click.pass_obj
def compliance_job(T):
    """
    Create, retrieve and list batch compliance jobs for Tweets and Users.
    """
    # Pure command group: the subcommands registered with
    # @compliance_job.command(...) do all the work, so there is no body.
@compliance_job.command("create")
@click.argument(
    "job_type",
    required=True,
    type=click.Choice(["tweets", "users"], case_sensitive=False),
)
@click.argument("infile", type=click.Path(), required=True)
@click.argument("outfile", type=click.Path(), required=False, default=None)
@click.option("--job-name", type=str, help="A name or tag to help identify the job.")
@click.option(
    "--wait/--no-wait",
    default=True,
    help="Wait for the job to finish and download the results. Wait by default.",
)
@command_line_progressbar_option
@click.pass_obj
@cli_api_error
def compliance_job_create(T, job_type, infile, outfile, job_name, wait, hide_progress):
    """
    Create a new compliance job and upload tweet IDs.
    """
    # NOTE: the docstring above is also the command's --help text, so the
    # implementation notes are kept in comments.

    # Check for file contents: the first line has to parse as an integer ID.
    with open(infile) as f:
        try:
            int(f.readline())
            looks_like_ids = True
        except ValueError:
            # ValueError also covers undecodable (binary) input, while
            # KeyboardInterrupt and real I/O errors are no longer swallowed.
            looks_like_ids = False

    if not looks_like_ids:
        click.echo(
            click.style(
                f"🙃 The file {infile} does not contain a list of IDs. Use:",
                fg="red",
            ),
            err=True,
        )
        click.echo(
            click.style(
                f" twarc2 dehydrate --id-type {job_type} {infile} output_ids.txt",
            ),
            err=True,
        )
        click.echo(
            click.style(
                f"to create a file with {job_type} IDs.",
                fg="red",
            ),
            err=True,
        )
        return

    # Create a job (not resumable right now):
    _job = T.compliance_job_create(job_type, job_name)["data"]

    click.echo(
        click.style(
            f"Created a new {job_type} job {_job['id']}. Uploading {infile}.",
            fg="yellow",
            bold=True,
        ),
        err=True,
    )

    # Upload the file, reporting progress as the request body is read
    with open(infile, "rb") as f:
        with tqdm(
            disable=hide_progress,
            total=os.stat(infile).st_size,
            unit="B",
            unit_scale=True,
            unit_divisor=1024,
        ) as pbar:
            wrapped_file = CallbackIOWrapper(pbar.update, f, "read")
            upload = requests.put(
                _job["upload_url"],
                data=wrapped_file,
                headers={"Content-Type": "text/plain"},
            )

    if not upload.ok:
        # The job never received its input, so waiting for it is pointless.
        click.echo(
            click.style(
                f"Uploading {infile} for job {_job['id']} failed with HTTP status {upload.status_code}.",
                fg="red",
                bold=True,
            ),
            err=True,
        )
        return

    if wait and _wait_for_job(T, _job, hide_progress):
        _download_job(_job, outfile, hide_progress)
def _get_job(T, job):
    """
    Fetch a single compliance job by ID.

    Returns the "data" member of the API response. When the response has no
    "data" member an error message is written to stderr and None is returned.
    """
    response = T.compliance_job_get(job)

    if "data" in response:
        return response["data"]

    not_found = click.style(
        f"Job {job} could not be found. List valid job IDs with 'twarc2 compliance-job list'",
        fg="red",
        bold=True,
    )
    click.echo(not_found, err=True)
    return None
def _download_job(job, outfile=None, hide_progress=False):
    """
    Download the results of a compliance job to a local file.

    job is the job dictionary; its "id", "status", "type" and "download_url"
    values are read. outfile is the path to write to, defaulting to
    "{type}_compliance_{id}.json". hide_progress disables the progress bar.

    Raises requests.exceptions.HTTPError if the download URL answers with an
    error status, so that an error body is never saved as if it were results.
    """
    click.echo(
        click.style(
            f"Job {job['id']} is '{job['status']}'. Downloading Results...",
            fg="yellow",
            bold=True,
        ),
        err=True,
    )
    url = job["download_url"]
    if outfile is None:
        outfile = f"{job['type']}_compliance_{job['id']}.json"

    # Use the response as a context manager so the streamed connection is
    # always released, even if writing the file fails part way through.
    with requests.get(url, stream=True) as response:
        # Check the status before the output file is created or truncated.
        response.raise_for_status()
        total = int(response.headers.get("content-length", 0))
        with open(outfile, "wb") as fout, tqdm(
            disable=hide_progress,
            unit="B",
            unit_scale=True,
            unit_divisor=1024,
            miniters=1,
            total=total,
        ) as pbar:
            pbar.set_postfix_str(outfile)
            for chunk in response.iter_content(chunk_size=4096):
                fout.write(chunk)
                pbar.update(len(chunk))
def _error_str(errors):
    """
    Collapse error objects into a newline delimited, red coloured string.

    The passed in errors can be:

    * a list of error objects (dicts with "message", "title", "type" keys)
    * a single object with an "errors" key pointing to a list of error objects
    * a list of objects, each of which has an "errors" key
    * a single error object

    Anything that is not a dict is rendered using its str() form.
    """

    def _error_parts(obj):
        # An object carrying an "errors" key is a container for error objects,
        # anything else is treated as an error object in its own right.
        if isinstance(obj, dict) and "errors" in obj:
            nested = obj["errors"]
            return list(nested) if isinstance(nested, (list, tuple)) else [nested]
        return [obj]

    groups = list(errors) if isinstance(errors, (list, tuple)) else [errors]

    parts = []
    for group in groups:
        for part in _error_parts(group):
            if not isinstance(part, dict):
                parts.append("💣 " + click.style(str(part), fg="red"))
                continue

            s = "💣 "
            if "message" in part:
                s += click.style(part["message"], fg="red")
            elif "title" in part:
                s += click.style(part["title"], fg="red")
            else:
                # Kept as in the original: this branch replaces the prefix.
                s = click.style("Unknown error", fg="red")
            if "type" in part:
                s += f" see: {part['type']}"
            parts.append(s)

    return click.style("\n".join(parts), fg="red")
def rate_limit(f):
    """
    A decorator to handle rate limiting from the Twitter API. If
    a rate limit error is encountered we will sleep until we can
    issue the API call again.

    The wrapped callable is called repeatedly until it returns a response
    with status 200, which is then returned. The response object needs
    `status_code`, `headers`, `url` and `raise_for_status()`.

    * 401: the HTTPError raised by `raise_for_status()` is re-raised with an
      explanatory message appended.
    * 429: sleep until the time in the x-rate-limit-reset header, then retry.
    * 503 from the search/all endpoint: short back-off, and the
      "max_results" value in the `params` keyword argument (when present)
      is halved, never below 50.
    * other 5xx: sleep 60 seconds per consecutive error, then retry.
    * anything else: `raise_for_status()` is called.

    After more than 30 consecutive server errors `raise_for_status()` is
    called to give up.
    """
    # functools is not imported at the top of this module, so import locally.
    from functools import wraps

    search_all_prefix = "https://api.twitter.com/2/tweets/search/all"

    @wraps(f)
    def new_f(*args, **kwargs):
        errors = 0
        while True:
            resp = f(*args, **kwargs)
            if resp.status_code == 200:
                errors = 0
                return resp
            elif resp.status_code == 401:
                # Hack to retain the original exception, but augment it with
                # additional context for the user to interpret it.
                try:
                    resp.raise_for_status()
                except HTTPError as e:
                    message = (
                        "\nThis is a protected or locked account, or"
                        + " the credentials provided are no longer valid."
                    )
                    e.args = (e.args[0] + message,) + e.args[1:]
                    log.warning("401 Authentication required for %s", resp.url)
                    raise
            elif resp.status_code == 429:
                try:
                    reset = int(resp.headers["x-rate-limit-reset"])
                    now = time.time()
                    seconds = reset - now + 10
                except KeyError:
                    # gnip endpoint doesn't have x-rate-limit-reset
                    seconds = 2
                if seconds < 1:
                    seconds = 10
                log.warning("rate limit exceeded: sleeping %s secs", seconds)
                time.sleep(seconds)
            # Special case for Academic all archive search instability
            # If we hit a 503 for that specific endpoint, we sleep for a
            # shorter amount of time, and reduce the number of tweets per
            # request.
            elif resp.status_code == 503 and resp.url.startswith(search_all_prefix):
                errors += 1
                if errors > 30:
                    log.warning("too many errors from Twitter, giving up")
                    resp.raise_for_status()

                # Shorter wait time than other endpoints for this specific
                # case. Also on the first error, only wait for the single
                # second required by the 1 request/s rate limit
                seconds = max(1, 15 * (errors - 1))

                # Backoff the number of results retrieved for this request,
                # but only when the caller actually passed a page size.
                params = kwargs.get("params")
                if isinstance(params, dict) and "max_results" in params:
                    old_page_size = params["max_results"]
                    params["max_results"] = max(50, old_page_size // 2)
                    log.warning(
                        "%s from Twitter search/all API, sleeping %s and backing off to %s tweets/page",
                        resp.status_code,
                        seconds,
                        params["max_results"],
                    )
                else:
                    log.warning(
                        "%s from Twitter search/all API, sleeping %s",
                        resp.status_code,
                        seconds,
                    )
                time.sleep(seconds)
            elif resp.status_code >= 500:
                errors += 1
                if errors > 30:
                    log.warning("too many errors from Twitter, giving up")
                    resp.raise_for_status()
                seconds = 60 * errors
                log.warning(
                    "%s from Twitter API, sleeping %s", resp.status_code, seconds
                )
                time.sleep(seconds)
            else:
                resp.raise_for_status()

    return new_f
def filter_protected(f):
    """
    Decorator for generator methods that drops protected tweets and users.

    Objects yielded by the wrapped method are passed through unchanged,
    except when `self.protected` compares equal to False: then an object is
    skipped if its embedded "user" is protected or if it has a truthy
    "protected" value itself.
    """

    def _is_protected(item):
        # A tweet carries the flag on its "user"; a user object carries it
        # at the top level.
        return bool(
            ("user" in item and item["user"]["protected"])
            or ("protected" in item and item["protected"])
        )

    def new_f(self, *args, **kwargs):
        for item in f(self, *args, **kwargs):
            # Equality (not identity or truthiness) is kept on purpose so
            # that None still means "do not filter", as before.
            if self.protected == False and _is_protected(item):
                continue
            yield item

    return new_f
def catch_request_exceptions(f, tries=30):
    """
    A decorator to handle all request exceptions. This decorator will catch
    *any* request level error, reconnect and try again. It does not handle
    HTTP protocol level errors (404, 500) etc.

    It will try up to tries times consecutively before giving up. A
    successful call to f returns its result immediately.

    Between attempts it sleeps for the square of the number of consecutive
    errors (in seconds) and calls `self.connect()`.

    Raises the last caught exception once tries consecutive attempts have
    failed. requests.exceptions.HTTPError is always re-raised immediately.
    """

    # pyOpenSSL has been known to throw these connection errors that need to be
    # caught separately: https://github.com/edsu/twarc/issues/72
    try:
        import OpenSSL

        connection_error = OpenSSL.SSL.SysCallError
    except:  # noqa: E722 - any failure to load pyOpenSSL means "not available"
        connection_error = requests.exceptions.ConnectionError

    @wraps(f)
    def new_f(self, *args, **kwargs):
        errors = 0
        while True:
            try:
                return f(self, *args, **kwargs)
            except (requests.exceptions.RequestException, connection_error) as e:
                # don't catch any HTTP errors since those are handled separately
                if isinstance(e, requests.exceptions.HTTPError):
                    raise

                errors += 1
                log.warning("caught requests exception: %s", e)

                # Give up by raising, rather than falling out of the loop and
                # handing None back to the caller.
                if errors >= tries:
                    log.error(f"giving up, too many request exceptions: {tries}")
                    raise

                seconds = errors**2
                log.info("sleeping %s", seconds)
                time.sleep(seconds)
                self.connect()

    return new_f
""" def __init__(self, f): self.f = f # this is needed for click help docs to work properly self.__doc__ = f.__doc__ def __call__(self, *args, **kwargs): try: return self.f(*args, **kwargs) except requests.exceptions.HTTPError as e: try: result = e.response.json() if "errors" in result: for error in result["errors"]: msg = error.get("message", "Unknown error") elif "title" in result: msg = result["title"] else: msg = "Unknown error" except ValueError: msg = f"Unable to parse {e.response.status_code} error as JSON: {e.response.text}" except InvalidAuthType as e: msg = "This command requires application authentication, try passing --app-auth" except ValueError as e: msg = str(e) click.echo( click.style("⚡ ", fg="yellow") + click.style(msg, fg="red"), err=True ) def requires_app_auth(f): """ Ensure that application authentication is set for calls that only work in that mode. """ @wraps(f) def new_f(self, *args, **kwargs): if self.auth_type != "application": raise InvalidAuthType( "This endpoint only works with application authentication" ) else: return f(self, *args, **kwargs) return new_f class InvalidAuthType(Exception): """ Raised when the endpoint called is not supported by the current auth type. """ class FileLineProgressBar(tqdm): """ A progress bar based on input file line count. Counts an input file by lines. This tries to read the entire file and count newlines in a robust way. 
""" def __init__(self, infile, outfile, **kwargs): disable = False if "disable" not in kwargs else kwargs["disable"] if infile is not None and (infile.name == ""): disable = True if outfile is not None and (outfile.name == ""): disable = True kwargs["disable"] = disable kwargs["miniters"] = 1 kwargs[ "bar_format" ] = "{l_bar}{bar}| Processed {n_fmt}/{total_fmt} lines of input file [{elapsed}<{remaining}, {rate_fmt}{postfix}]" # Warn for large (> 1 GB) input files: if not disable and (os.stat(infile.name).st_size / (1024 * 1024 * 1024)) > 1: click.echo( click.style( f"Input File Size is {os.stat(infile.name).st_size / (1024*1024):.2f} MB, it may take a while to process. CTRL+C to stop.", fg="yellow", bold=True, ), err=True, ) def total_lines(): with open(infile.name, "r", encoding="utf-8", errors="ignore") as f: return sum(1 for _ in f) kwargs["total"] = total_lines() if not disable else 1 super().__init__(**kwargs) def update_with_result( self, result, field="id", error_resource_type=None, error_parameter="ids" ): """ Update the progress bar appropriately, with a full API response. For convenience, and drop in compatibility with FileSizeProgressBar otherwise use tqdm's update(). """ try: if "data" in result: for item in result["data"]: self.update() if error_resource_type and "errors" in result: for error in result["errors"]: # Account for deleted data # Errors have very inconsistent format, missing fields for different types of errors... if ( "resource_type" in error and error["resource_type"] == error_resource_type ): if ( "parameter" in error and error["parameter"] == error_parameter ): self.update() # todo: hide or show this? # self.set_description( # "Errors encountered, results may be incomplete" # ) # print(error["value"], error["resource_type"], error["parameter"]) except Exception as e: log.error(f"Failed to update progress bar: {e}") class FileSizeProgressBar(tqdm): """ An input file size based progress bar. Counts an input file in bytes. 
This will also dig into the responses and add up the outputs to match the file size. Overrides `disable` parameter if file is a pipe. """ def __init__(self, infile, outfile, **kwargs): disable = False if "disable" not in kwargs else kwargs["disable"] if infile is not None and (infile.name == ""): disable = True if outfile is not None and (outfile.name == ""): disable = True kwargs["disable"] = disable kwargs["unit"] = "B" kwargs["unit_scale"] = True kwargs["unit_divisor"] = 1024 kwargs["miniters"] = 1 kwargs[ "bar_format" ] = "{l_bar}{bar}| Processed {n_fmt}/{total_fmt} of input file [{elapsed}<{remaining}, {rate_fmt}{postfix}]" kwargs["total"] = os.stat(infile.name).st_size if not disable else 1 super().__init__(**kwargs) def update_with_result( self, result, field="id", error_resource_type=None, error_parameter="ids" ): """ Update the progress bar appropriately, with a full API response. For convenience, otherwise use twdm's own update() method. """ try: if "data" in result: for item in result["data"]: # Use the length of the id / name and a newline to match original file self.update(len(item[field]) + len("\n")) if error_resource_type and "errors" in result: for error in result["errors"]: # Account for deleted data # Errors have very inconsistent format, missing fields for different types of errors... if ( "resource_type" in error and error["resource_type"] == error_resource_type ): if ( "parameter" in error and error["parameter"] == error_parameter ): self.update(len(error["value"]) + len("\n")) # todo: hide or show this? # self.set_description( # "Errors encountered, results may be incomplete" # ) # print(error["value"], error["resource_type"], error["parameter"]) except Exception as e: log.error(f"Failed to update progress bar: {e}") class TimestampProgressBar(tqdm): """ A Timestamp based progress bar. Counts timestamp ranges in milliseconds. This can be used to display a progress bar for tweet ids and time ranges. 
""" def __init__(self, since_id, until_id, start_time, end_time, **kwargs): self.early_stop = True self.tweet_count = 0 disable = False if "disable" not in kwargs else kwargs["disable"] kwargs["disable"] = disable if start_time is None and (since_id is None and until_id is None): start_time = datetime.datetime.now( datetime.timezone.utc ) - datetime.timedelta(days=7) if end_time is None and (since_id is None and until_id is None): end_time = datetime.datetime.now( datetime.timezone.utc ) - datetime.timedelta(seconds=30) if since_id and not until_id: until_id = _millis2snowflake( _date2millis(datetime.datetime.now(datetime.timezone.utc)) ) if until_id and not since_id: since_id = 1 total = ( _snowflake2millis(until_id) - _snowflake2millis(since_id) if (since_id and until_id) else _date2millis(end_time) - _date2millis(start_time) ) kwargs["miniters"] = 1 kwargs["total"] = total tweets_timeline_format = "{l_bar}{bar}| Processed {n_time}/{total_time} [{elapsed}<{remaining}, {tweet_count} tweets total {postfix}]" kwargs["bar_format"] = ( tweets_timeline_format if "bar_format" not in kwargs else kwargs["bar_format"] ) super().__init__(**kwargs) def update_with_dates(self, start_span, end_span): """ Update the progress bar with a start and end time span. """ try: if isinstance(start_span, str): start_span = datetime.datetime.strptime( start_span, "%Y-%m-%dT%H:%M:%S.%fZ" ) if isinstance(end_span, str): end_span = datetime.datetime.strptime(end_span, "%Y-%m-%dT%H:%M:%S.%fZ") n = _date2millis(end_span) - _date2millis(start_span) if self.n + n > self.total: self.n = self.total else: self.update(n) except Exception as e: log.error(f"Failed to update progress bar: {e}") def update_with_result(self, result): """ Update progress bar based on snowflake ids from an API response. 
""" try: newest_id = result["meta"]["newest_id"] oldest_id = result["meta"]["oldest_id"] n = _snowflake2millis(int(newest_id)) - _snowflake2millis(int(oldest_id)) self.update(n) self.tweet_count += len(result["data"]) except Exception as e: log.error(f"Failed to update progress bar: {e}") @property def format_dict(self): d = super(TimestampProgressBar, self).format_dict # original format dict tweets_per_second = int(self.tweet_count / d["elapsed"] if d["elapsed"] else 0) n_time = humanize.naturaldelta(datetime.timedelta(seconds=int(d["n"]) // 1000)) total_time = humanize.naturaldelta( datetime.timedelta(seconds=int(d["total"]) // 1000) ) d.update(n_time=n_time) d.update(total_time=total_time) d.update(tweet_count=self.tweet_count) d.update(tweets_per_second=tweets_per_second) return d def close(self): if not self.early_stop: # Finish the bar to 100% even if the last tweet ids do not cover the full time range self.update(self.total - self.n) super().close() def _date2millis(dt): return int(dt.timestamp() * 1000) def _millis2date(ms): return datetime.datetime.utcfromtimestamp(ms // 1000).replace( microsecond=ms % 1000 * 1000 ) def _snowflake2millis(snowflake_id): return (snowflake_id >> 22) + 1288834974657 def _millis2snowflake(ms): return (int(ms) - 1288834974657) << 22 ================================================ FILE: src/twarc/expansions.py ================================================ """ This module contains a list of the known Twitter V2+ API expansions and fields for each expansion, and a function flatten() for "flattening" a result set, including all expansions inline. ensure_flattened() can be used in tweet processing programs that need to make sure that data is flattened. 
""" import logging from itertools import chain from collections import defaultdict log = logging.getLogger("twarc") EXPANSIONS = [ "author_id", "in_reply_to_user_id", "referenced_tweets.id", "referenced_tweets.id.author_id", "entities.mentions.username", "attachments.poll_ids", "attachments.media_keys", "geo.place_id", "edit_history_tweet_ids", ] USER_FIELDS = [ "created_at", "description", "entities", "id", "location", "name", "pinned_tweet_id", "profile_image_url", "protected", "public_metrics", "url", "username", "verified", "verified_type", "withheld", ] TWEET_FIELDS = [ "attachments", "author_id", "context_annotations", "conversation_id", "created_at", "entities", "geo", "id", "in_reply_to_user_id", "lang", "public_metrics", # "non_public_metrics", # private # "organic_metrics", # private # "promoted_metrics", # private "text", "possibly_sensitive", "referenced_tweets", "reply_settings", "source", "withheld", "edit_controls", "edit_history_tweet_ids", ] MEDIA_FIELDS = [ "alt_text", "duration_ms", "height", "media_key", "preview_image_url", "type", "url", "width", "variants", # "non_public_metrics", # private # "organic_metrics", # private # "promoted_metrics", # private "public_metrics", ] POLL_FIELDS = ["duration_minutes", "end_datetime", "id", "options", "voting_status"] PLACE_FIELDS = [ "contained_within", "country", "country_code", "full_name", "geo", "id", "name", "place_type", ] LIST_FIELDS = [ "id", "name", "owner_id", "created_at", "member_count", "follower_count", "private", "description", ] def extract_includes(response, expansion, _id="id"): if "includes" in response and expansion in response["includes"]: return defaultdict( lambda: {}, {include[_id]: include for include in response["includes"][expansion]}, ) else: return defaultdict(lambda: {}) def flatten(response): """ Flatten an API response by moving all "included" entities inline with the tweets they are referenced from. 
flatten expects an entire page response from the API (data, includes, meta) and will raise a ValueError if what is passed in does not appear to be an API response. It will return a list of dictionaries where each dictionary represents a tweet. Empty objects will be returned for things that are missing in includes, which can happen when protected or delete users or tweets are referenced. """ # Users extracted both by id and by username for expanding mentions includes_users = defaultdict( lambda: {}, { **extract_includes(response, "users", "id"), **extract_includes(response, "users", "username"), }, ) # Media is by media_key, not id includes_media = extract_includes(response, "media", "media_key") includes_polls = extract_includes(response, "polls") includes_places = extract_includes(response, "places") # Tweets in includes will themselves be expanded includes_tweets = extract_includes(response, "tweets") # Errors are returned but unused here for now includes_errors = extract_includes(response, "errors") def expand_payload(payload): """ Recursively step through an object and sub objects and append extra data. Can be applied to any tweet, list of tweets, sub object of tweet etc. 
""" # Don't try to expand on primitive values, return strings as is: if isinstance(payload, (str, bool, int, float)): return payload # expand list items individually: elif isinstance(payload, list): payload = [expand_payload(item) for item in payload] return payload # Try to expand on dicts within dicts: elif isinstance(payload, dict): for key, value in payload.items(): payload[key] = expand_payload(value) if "author_id" in payload: payload["author"] = includes_users[payload["author_id"]] if "in_reply_to_user_id" in payload: payload["in_reply_to_user"] = includes_users[payload["in_reply_to_user_id"]] if "media_keys" in payload: payload["media"] = list( includes_media[media_key] for media_key in payload["media_keys"] ) if "poll_ids" in payload and len(payload["poll_ids"]) > 0: poll_id = payload["poll_ids"][-1] # only ever 1 poll per tweet. payload["poll"] = includes_polls[poll_id] if "geo" in payload and "place_id" in payload["geo"]: place_id = payload["geo"]["place_id"] payload["geo"] = {**payload["geo"], **includes_places[place_id]} if "mentions" in payload: payload["mentions"] = list( {**referenced_user, **includes_users[referenced_user["username"]]} for referenced_user in payload["mentions"] ) if "referenced_tweets" in payload: payload["referenced_tweets"] = list( {**referenced_tweet, **includes_tweets[referenced_tweet["id"]]} for referenced_tweet in payload["referenced_tweets"] ) if "pinned_tweet_id" in payload: payload["pinned_tweet"] = includes_tweets[payload["pinned_tweet_id"]] return payload # First expand the tweets in "includes", before processing actual result tweets: for included_id, included_tweet in extract_includes(response, "tweets").items(): includes_tweets[included_id] = expand_payload(included_tweet) # Now expand the list of tweets or an individual tweet in "data" tweets = [] if "data" in response: data = response["data"] if isinstance(data, list): tweets = expand_payload(response["data"]) elif isinstance(data, dict): tweets = 
def ensure_flattened(data):
    """
    Will ensure that the supplied data is "flattened". The input data can be a
    response from the Twitter API, a list of tweet dictionaries, or a single
    tweet dictionary. It will always return a list of tweet dictionaries. A
    ValueError will be thrown if the supplied data is not recognizable or it
    cannot be flattened.

    For a list, only the first item is inspected to decide how the whole
    list is handled.

    ensure_flattened is designed for use in twarc plugins and other tweet
    processing applications that want to operate on a stream of tweets, and
    examine included entities like users and tweets without hunting and
    pecking in the response data.
    """
    if isinstance(data, dict):
        if "data" in data:
            if "includes" not in data:
                # flatten() will still work, just with {} empty expansions,
                # log a warning.
                log.warning(f"Unable to expand dictionary without includes: {data}")
            return flatten(data)
        if "includes" not in data:
            # Just an object with errors gives an empty list, anything else
            # with neither "data" nor "includes" is already flattened.
            return [] if "errors" in data else [data]

    # If it's a list of objects (could be list of responses, or tweets, or users):
    elif isinstance(data, list) and len(data) > 0 and isinstance(data[0], dict):
        first = data[0]
        if "data" in first:
            if "includes" not in first:
                log.warning(f"Unable to expand dictionary without includes: {data[0]}")
            # Flatten each response individually and return a single list
            return list(chain.from_iterable(flatten(item) for item in data))
        if "includes" not in first:
            # Return already flattened data as is
            return data

    # Unknown format, eg: list of lists, a primitive, or objects that have
    # "includes" but no "data". Always raise rather than returning None.
    raise ValueError(f"Cannot flatten unrecognized data: {data}")
" ) # Save a config with just the bearer_token if continue_adding.lower() != "y": return {"bearer_token": bearer_token} else: "Configure API keys and secrets." consumer_key = input("Please enter your API key: ") consumer_secret = input("Please enter your API secret: ") # verify that the keys work to get the bearer token url = "https://api.twitter.com/oauth2/token" params = {"grant_type": "client_credentials"} auth = requests.auth.HTTPBasicAuth(consumer_key, consumer_secret) try: resp = requests.post(url, params, auth=auth) resp.raise_for_status() result = resp.json() bearer_token = result["access_token"] except Exception as e: return None answered = False while not answered: print( "\nHow would you like twarc to obtain your user keys?\n\n1) generate access keys by visiting Twitter\n2) manually enter your access token and secret\n" ) answer = input("Please enter your choice [1 or 2] ") if answer == "1": answered = True generate = True elif answer == "2": answered = True generate = False if generate: request_token_url = "https://api.twitter.com/oauth/request_token" oauth = OAuth1(consumer_key, client_secret=consumer_secret) r = requests.post(url=request_token_url, auth=oauth) credentials = parse_qs(r.text) if not credentials: print("\nError: invalid credentials.") print( "Please check that you are copying and pasting correctly and try again.\n" ) return resource_owner_key = credentials.get("oauth_token")[0] resource_owner_secret = credentials.get("oauth_token_secret")[0] base_authorization_url = "https://api.twitter.com/oauth/authorize" authorize_url = base_authorization_url + "?oauth_token=" + resource_owner_key print( "\nPlease log into Twitter and visit this URL in your browser:\n%s" % authorize_url ) verifier = input( "\nAfter you have authorized the application please enter the displayed PIN: " ) access_token_url = "https://api.twitter.com/oauth/access_token" oauth = OAuth1( consumer_key, client_secret=consumer_secret, resource_owner_key=resource_owner_key, 
resource_owner_secret=resource_owner_secret, verifier=verifier, ) r = requests.post(url=access_token_url, auth=oauth) credentials = parse_qs(r.text) if not credentials: print("\nError: invalid PIN") print("Please check that you entered the PIN correctly and try again.\n") return access_token = resource_owner_key = credentials.get("oauth_token")[0] access_token_secret = credentials.get("oauth_token_secret")[0] screen_name = credentials.get("screen_name")[0] else: access_token = input("Enter your Access Token: ") access_token_secret = input("Enter your Access Token Secret: ") screen_name = "default" return { "consumer_key": consumer_key, "consumer_secret": consumer_secret, "access_token": access_token, "access_token_secret": access_token_secret, "bearer_token": bearer_token, } ================================================ FILE: src/twarc/json2csv.py ================================================ #!/usr/bin/env python import sys from dateutil.parser import parse as date_parse from six import string_types if sys.version_info[0] < 3: try: import unicodecsv as csv except ImportError: sys.exit("unicodecsv is required for python 2") else: import csv def get_headings(): return [ "id", "tweet_url", "created_at", "parsed_created_at", "user_screen_name", "text", "tweet_type", "coordinates", "hashtags", "media", "urls", "favorite_count", "in_reply_to_screen_name", "in_reply_to_status_id", "in_reply_to_user_id", "lang", "place", "possibly_sensitive", "retweet_count", "retweet_or_quote_id", "retweet_or_quote_screen_name", "retweet_or_quote_user_id", "source", "user_id", "user_created_at", "user_default_profile_image", "user_description", "user_favourites_count", "user_followers_count", "user_friends_count", "user_listed_count", "user_location", "user_name", "user_statuses_count", "user_time_zone", "user_urls", "user_verified", ] def get_row(t, excel=False): get = t.get user = t.get("user").get return [ get("id_str"), tweet_url(t), get("created_at"), 
def clean_str(string):
    """Return string with newlines flattened; None if it is not a string."""
    if isinstance(string, string_types):
        return string.replace("\n", " ").replace("\r", "")
    return None


def text(t):
    """Return the fullest available text of tweet t."""
    # need to look at original tweets for retweets for full text
    t = t.get("retweeted_status") or t

    if "extended_tweet" in t:
        return t["extended_tweet"]["full_text"]
    if "full_text" in t:
        return t["full_text"]
    return t["text"]


def coordinates(t):
    """Return the tweet's point coordinates as two floats, or None."""
    if t.get("coordinates"):
        return "%f %f" % tuple(t["coordinates"]["coordinates"])
    return None


def hashtags(t):
    """Return the space separated hashtags of tweet t."""
    # If it's a retweet, the hashtags might be cutoff in the retweet object,
    # so check the enclosed original tweet for the full list. A null
    # retweeted_status is treated like a missing one, as text() does.
    source = t.get("retweeted_status") or t
    return " ".join(h["text"] for h in source["entities"]["hashtags"])


def media(t):
    """Return the space separated media URLs of tweet t, or None."""
    # extended_entities takes precedence over entities when it lists media
    for container in (t.get("extended_entities") or {}, t.get("entities") or {}):
        if "media" in container:
            return " ".join(m["media_url_https"] for m in container["media"])
    return None


def urls(t):
    """Return the space separated expanded URLs of tweet t."""
    return " ".join([h["expanded_url"] or "" for h in t["entities"]["urls"]])


def place(t):
    """Return the full name of the tweet's place, or None."""
    if t.get("place"):
        return t["place"]["full_name"]
    return None


def _retweet_or_quote(t):
    """Return the retweeted status, else the quoted status, else None."""
    return t.get("retweeted_status") or t.get("quoted_status") or None


def retweet_id(t):
    """Return the id of the retweeted or quoted tweet, or None."""
    status = _retweet_or_quote(t)
    return status["id_str"] if status else None


def retweet_screen_name(t):
    """Return the screen name of the retweeted or quoted user, or None."""
    status = _retweet_or_quote(t)
    return status["user"]["screen_name"] if status else None


def retweet_user_id(t):
    """Return the id of the retweeted or quoted user, or None."""
    status = _retweet_or_quote(t)
    return status["user"]["id_str"] if status else None


def favorite_count(t):
    """Return the favorite count, taken from the original for retweets."""
    source = t.get("retweeted_status") or t
    return source["favorite_count"]


def tweet_url(t):
    """Return the twitter.com URL for tweet t."""
    return "https://twitter.com/%s/status/%s" % (t["user"]["screen_name"], t["id_str"])


def user_urls(t):
    """Return the space separated profile URLs of the tweet's user, or None."""
    u = t.get("user")
    if not u:
        return None
    # any of the nested containers may be missing or null
    url_entity = (u.get("entities") or {}).get("url") or {}
    entries = url_entity.get("urls") or []
    return " ".join(e["expanded_url"] for e in entries if e["expanded_url"])
"original" ================================================ FILE: src/twarc/version.py ================================================ import platform version = "2.14.1" user_agent = f"twarc/{version} ({platform.system()} {platform.machine()}) {platform.python_implementation()}/{platform.python_version()}" ================================================ FILE: test_twarc.py ================================================ import os import re import json import time import dotenv import pytest import logging import datetime dotenv.load_dotenv() try: from unittest.mock import patch, call, MagicMock # Python 3 except ImportError: from mock import patch, call, MagicMock # Python 2 from requests_oauthlib import OAuth1Session import requests import twarc from twarc import json2csv """ You will need to have these environment variables set to run these tests: * CONSUMER_KEY * CONSUMER_SECRET * ACCESS_TOKEN * ACCESS_TOKEN_SECRET To run the premium tests, you will need to set the following environment variable: TWITTER_ENV To run the gnip test, you will need to set the following environment variables: GNIP_ENV GNIP_ACCOUNT GNIP_USERNAME GNIP_PASSWORD """ logging.basicConfig(filename="test.log", level=logging.INFO) T = twarc.Twarc() def test_search(): count = 0 for tweet in T.search("obama"): assert tweet["id_str"] count += 1 if count == 10: break assert count == 10 def test_search_max_pages(): tweets = list(T.search("obama", max_pages=1)) assert 0 < len(tweets) <= 100 tweets = list(T.search("obama", max_pages=2)) assert 100 < len(tweets) <= 200 def test_since_id(): for tweet in T.search("obama"): id = tweet["id_str"] break assert id time.sleep(5) for tweet in T.search("obama", since_id=id): assert tweet["id_str"] > id def test_max_id(): for tweet in T.search("obama"): id = tweet["id_str"] break assert id time.sleep(5) count = 0 for tweet in T.search("obama", max_id=id): count += 1 assert tweet["id_str"] <= id if count > 100: break def test_max_and_since_ids(): max_id = 
@pytest.mark.skip(reason="v1.1 filter API disabled March 2023")
def test_follow():
    """The first streamed tweet should involve one of the followed users."""
    user_ids = [
        "87818409",  # @guardian
        "428333",  # @cnnbrk
        "5402612",  # @BBCBreaking
        "2467791",  # @washingtonpost
        "1020058453",  # @BuzzFeedNews
        "23484039",  # WSJbreakingnews
        "384438102",  # ABCNewsLive
        "87416722",  # SkyNewsBreak
    ]
    found = False

    for tweet in T.filter(follow=",".join(user_ids)):
        assert tweet["id_str"]

        # retweeted_status and quoted_status only exist on retweets and
        # quotes, so they are looked up defensively
        retweeted = tweet.get("retweeted_status") or {}
        quoted = tweet.get("quoted_status") or {}
        involved = (
            tweet["user"]["id_str"],
            tweet.get("in_reply_to_user_id_str"),
            retweeted.get("user", {}).get("id_str"),
            quoted.get("user", {}).get("id_str"),
        )
        found = any(user_id in user_ids for user_id in involved)

        # only the first tweet from the stream is examined
        break

    if not found:
        logging.warning(
            "couldn't find user in response: %s", json.dumps(tweet, indent=2)
        )

    assert found

    # reconnect to close streaming connection for other tests
    T.connect()


@pytest.mark.skip(reason="v1.1 filter API disabled March 2023")
def test_locations():
    """A bounding box around New York should yield a tweet from Manhattan."""
    # look for tweets from New York ; the bounding box is larger than NYC
    # so hopefully we'll find one from New York in the first 100?
    count = 0
    found = False

    for tweet in T.filter(locations="-74,40,-73,41"):
        # place can be null, so fall back to an empty dict
        if (tweet["place"] or {}).get("name") == "Manhattan":
            found = True
            break
        if count > 100:
            break
        count += 1

    assert found

    # reconnect to close streaming connection for other tests
    T.connect()


@pytest.mark.skip(reason="v1.1 filter API disabled March 2023")
def test_languages():
    """Filtering by language should only return tweets in those languages."""
    count = 0
    ok = True
    langs = ["fr", "es"]
    for tweet in T.filter("paris,madrid", lang=langs):
        if tweet["lang"] not in langs:
            ok = False
            break
        if count > 25:
            break
        count += 1
    assert ok

    # reconnect to close streaming connection for other tests
    T.connect()
def test_timeline_max_pages():
    """max_pages=1 returns one page, which is less than the full timeline."""
    guardian_id = "87818409"

    single_page = list(T.timeline(user_id=guardian_id, max_pages=1))
    assert 0 < len(single_page) <= 200

    everything = list(T.timeline(user_id=guardian_id))
    assert len(everything) > len(single_page)


def test_timeline_by_screen_name():
    """Every tweet in a user's timeline belongs to that screen_name."""
    screen_name = "guardian"
    expected = screen_name.lower()
    for tweet in T.timeline(screen_name=screen_name):
        assert tweet["user"]["screen_name"].lower() == expected


def test_home_timeline():
    """The home timeline yields at least one tweet."""
    # any() stops after the first tweet, just like breaking out of a loop
    assert any(True for _ in T.timeline())


def test_timeline_arg_handling():
    """Passing both screen_name and user_id to timeline is an error."""
    # Confirm that only user_id *or* screen_name is valid for timeline
    with pytest.raises(ValueError):
        list(T.timeline(screen_name="guardian", user_id="87818409"))


def test_timeline_with_since_id():
    """since_id set to the 11th newest tweet returns the 10 newer ones."""
    tweet_id = None
    for position, tweet in enumerate(T.timeline(screen_name="guardian"), start=1):
        tweet_id = tweet["id_str"]
        if position > 10:
            break

    newer = list(T.timeline(screen_name="guardian", since_id=tweet_id))
    assert len(newer) == 10
def test_user_lookup_by_user_id():
    """Looking up users by id returns exactly the requested users."""
    user_ids = [
        "87818409",  # @guardian
        "807095",  # @nytimes
        "428333",  # @cnnbrk
        "5402612",  # @BBCBreaking
        "2467791",  # @washingtonpost
        "1020058453",  # @BuzzFeedNews
        "23484039",  # WSJbreakingnews
        "384438102",  # ABCNewsLive
        "87416722",  # SkyNewsBreak
    ]

    returned = {user["id_str"] for user in T.user_lookup(ids=user_ids)}

    assert set(user_ids) == returned


def test_user_lookup_by_screen_name():
    """Looking up users by screen name returns exactly the requested users."""
    screen_names = [
        "guardian",
        "nytimes",
        "cnnbrk",
        "BBCBreaking",
        "washingtonpost",
        "BuzzFeedNews",
        "WSJbreakingnews",
        "ABCNewsLive",
        "SkyNewsBreak",
    ]

    # compare case-insensitively
    returned = {
        user["screen_name"].lower()
        for user in T.user_lookup(ids=screen_names, id_type="screen_name")
    }

    assert returned == {name.lower() for name in screen_names}
"just setting up my twttr" def test_dehydrate(): tweets = [ '{"text": "test tweet 1", "id_str": "800000000000000000"}', '{"text": "test tweet 2", "id_str": "800000000000000001"}', ] ids = list(T.dehydrate(iter(tweets))) assert len(ids) == 2 assert "800000000000000000" in ids assert "800000000000000001" in ids def test_hydrate(): ids = [ "501064188211765249", "501064196642340864", "501064197632167936", "501064196931330049", "501064198005481472", "501064198009655296", "501064198059597824", "501064198513000450", "501064180468682752", "501064199142117378", "501064171707170816", "501064200186118145", "501064200035516416", "501064201041743872", "501064201251880961", "501064198973960192", "501064201256071168", "501064202027798529", "501064202245521409", "501064201503113216", "501064202363359232", "501064202295848960", "501064202380115971", "501064202904403970", "501064203135102977", "501064203508412416", "501064203516407810", "501064203546148864", "501064203697156096", "501064204191690752", "501064204288540672", "501064197396914176", "501064194309906436", "501064204989001728", "501064204980592642", "501064204661850113", "501064205400039424", "501064205089665024", "501064206666702848", "501064207274868736", "501064197686296576", "501064207623000064", "501064207824351232", "501064208083980290", "501064208277319680", "501064208398573568", "501064202794971136", "501064208789045248", "501064209535614976", "501064209551994881", "501064141332029440", "501064207387742210", "501064210177331200", "501064210395037696", "501064210693230592", "501064210840035329", "501064211855069185", "501064192024006657", "501064200316125184", "501064205642903552", "501064212547137536", "501064205382848512", "501064213843169280", "501064208562135042", "501064214211870720", "501064214467731457", "501064215160172545", "501064209648848896", "501064215990648832", "501064216241897472", "501064215759568897", "501064211858870273", "501064216522932227", "501064216930160640", "501064217667960832", 
"501064211997274114", "501064212303446016", "501064213675012096", "501064218343661568", "501064213951823873", "501064219467341824", "501064219677044738", "501064210080473088", "501064220415229953", "501064220847656960", "501064222340423681", "501064222772445187", "501064222923440130", "501064220121632768", "501064222948593664", "501064224936714240", "501064225096499201", "501064225142624256", "501064225314185216", "501064225926561794", "501064226451259392", "501064226816143361", "501064227302674433", "501064227344646144", "501064227688558592", "501064228288364546", "501064228627705857", "501064229764751360", "501064229915729921", "501064231304065026", "501064231366983681", "501064231387947008", "501064231488200704", "501064231941570561", "501064232188665856", "501064232449114112", "501064232570724352", "501064232700350464", "501064233186893824", "501064233438568450", "501064233774510081", "501064235107897344", "619172347640201216", "619172347275116548", "619172341944332288", "619172340891578368", "619172338177843200", "619172335426244608", "619172332100284416", "619172331592773632", "619172331584376832", "619172331399725057", "619172328249757696", "619172328149118976", "619172326886674432", "619172324600745984", "619172323447324672", "619172321564098560", "619172320880533504", "619172320360333312", "619172319047647232", "619172314710609920", "619172313846693890", "619172312122814464", "619172306338709504", "619172304191401984", "619172303654518784", "619172302878408704", "619172300689031168", "619172298310840325", "619172295966392320", "619172293936291840", "619172293680345089", "619172285501456385", "619172282183725056", "619172281751711748", "619172281294655488", "619172278086070272", "619172275741298688", "619172274235535363", "619172257789706240", "619172257278111744", "619172253075378176", "619172242736308224", "619172236134588416", "619172235488718848", "619172232120692736", "619172227813126144", "619172221349662720", "619172216349917184", 
"619172214475108352", "619172209857327104", "619172208452182016", "619172208355749888", "619172193730199552", "619172193482768384", "619172184922042368", "619172182548049920", "619172179960328192", "619172175820357632", "619172174872469504", "619172173568053248", "619172170233679872", "619172165959708672", "619172163912908801", "619172162608463873", "619172158741303297", "619172157197819905", "501064235175399425", "501064235456401410", "615973042443956225", "618602288781860864", ] count = 0 for tweet in T.hydrate(iter(ids)): assert tweet["id_str"] count += 1 assert count > 80 # may need to adjust as these might get deleted @patch("twarc.client.OAuth1Session", autospec=True) def test_connection_error_get(oauth1session_class): mock_oauth1session = MagicMock(spec=OAuth1Session) mock_oauth1session.headers = {} oauth1session_class.return_value = mock_oauth1session mock_oauth1session.get.side_effect = requests.exceptions.ConnectionError t = twarc.Twarc( "consumer_key", "consumer_secret", "access_token", "access_token_secret", connection_errors=3, validate_keys=False, ) with pytest.raises(requests.exceptions.ConnectionError): t.get("https://api.twitter.com") assert 3 == mock_oauth1session.get.call_count @patch("twarc.client.OAuth1Session", autospec=True) def test_connection_error_post(oauth1session_class): mock_oauth1session = MagicMock(spec=OAuth1Session) mock_oauth1session.headers = {} oauth1session_class.return_value = mock_oauth1session mock_oauth1session.post.side_effect = requests.exceptions.ConnectionError t = twarc.Twarc( "consumer_key", "consumer_secret", "access_token", "access_token_secret", connection_errors=2, validate_keys=False, ) with pytest.raises(requests.exceptions.ConnectionError): t.post("https://api.twitter.com") assert 2 == mock_oauth1session.post.call_count def test_http_error_sample(): t = twarc.Twarc( "consumer_key", "consumer_secret", "access_token", "access_token_secret", http_errors=2, validate_keys=False, ) with 
def test_retweets():
    """Two well known tweets should have more than 100 retweets."""
    # hopefully there will continue to be more than 100 retweets of these
    retweets = list(T.retweets(["20", "21"]))
    assert len(retweets) > 100


def test_missing_retweets():
    """A tweet id that does not exist has no retweets."""
    # this tweet doesn't exist and cannot have any retweets
    retweets = list(T.retweets(["795972820413140991"]))
    assert len(retweets) == 0


def test_oembed():
    """The oembed response echoes the URL of the requested tweet."""
    tweet = next(T.search("obama"))
    screen_name = tweet["user"]["screen_name"]
    url = "https://twitter.com/{}/status/{}".format(screen_name, tweet["id_str"])

    embed = T.oembed(url)

    assert url == embed["url"]


def test_oembed_params():
    """Extra oembed parameters are reflected in the returned HTML."""
    tweet = next(T.search("obama"))
    screen_name = tweet["user"]["screen_name"]
    url = "https://twitter.com/{}/status/{}".format(screen_name, tweet["id_str"])

    embed = T.oembed(url, theme="dark")

    assert 'data-theme="dark"' in embed["html"]
def test_csv_retweet_hashtag():
    """json2csv should find hashtags that were cut off in a retweet."""
    for tweet in T.search("#auspol filter:nativeretweets filter:hashtags"):
        rendered = json2csv.hashtags(tweet)
        hashtags = rendered.split(" ") if rendered else []

        # success: the original tweet had more hashtags than the retweet
        if len(hashtags) > len(tweet["entities"]["hashtags"]):
            break
    else:
        # pytest.fail is not stripped when python runs with -O
        pytest.fail("no retweet with truncated hashtags was found")


@pytest.mark.skip(reason="v1.1 filter API disabled March 2023")
def test_truncated_text():
    """json2csv should use the full text of a truncated tweet."""
    for tweet in T.filter("tweet"):
        if tweet["truncated"]:
            break

    assert tweet["text"] != tweet["extended_tweet"]["full_text"]
    assert json2csv.text(tweet) == tweet["extended_tweet"]["full_text"]


def test_invalid_credentials():
    """Validating keys with a bogus consumer key raises a RuntimeError."""
    old_consumer_key = T.consumer_key
    T.consumer_key = "Definitely not a valid key"

    try:
        with pytest.raises(RuntimeError):
            T.validate_keys()
    finally:
        # T is shared by every test, so restore it even when this test fails
        T.consumer_key = old_consumer_key
count == 10 @pytest.mark.skipif(os.environ.get("TWITTER_ENV") == None, reason="No environment") def test_premium_30day_search(): twitter_env = os.environ["TWITTER_ENV"] t = twarc.Twarc(app_auth=True) now = datetime.date.today() then = now - datetime.timedelta(days=14) search = t.premium_search( q="blacklivesmatter", product="30day", environment=twitter_env, to_date=then, sandbox=True, ) tweet = next(search) assert tweet @pytest.mark.skipif(os.environ.get("TWITTER_ENV") == None, reason="No environment") def test_premium_fullarchive_search(): twitter_env = os.environ["TWITTER_ENV"] from_date = datetime.date(2013, 7, 1) to_date = datetime.date(2013, 8, 1) t = twarc.Twarc(app_auth=True) search = t.premium_search( q="blacklivesmatter", product="fullarchive", environment=twitter_env, from_date=from_date, to_date=to_date, sandbox=True, ) count = 0 for tweet in search: created_at = datetime.datetime.strptime( tweet["created_at"], "%a %b %d %H:%M:%S +0000 %Y" ) assert created_at.date() >= from_date assert created_at.date() <= to_date count += 1 assert count > 200 @pytest.mark.skipif(os.environ.get("GNIP_ENV") == None, reason="No gnip environment") def test_gnip_fullarchive_search(): twitter_env = os.environ["GNIP_ENV"] from_date = datetime.date(2013, 7, 1) to_date = datetime.date(2013, 8, 1) t = twarc.Twarc(gnip_auth=True) search = t.premium_search( q="blacklivesmatter", product="gnip_fullarchive", environment=twitter_env, from_date=from_date, to_date=to_date, sandbox=True, ) count = 0 for tweet in search: created_at = datetime.datetime.strptime( tweet["created_at"], "%a %b %d %H:%M:%S +0000 %Y" ) assert created_at.date() >= from_date assert created_at.date() <= to_date count += 1 assert count > 200 ================================================ FILE: test_twarc2.py ================================================ import os import pytz import twarc import dotenv import pytest import logging import pathlib import datetime import threading from unittest import TestCase from 
twarc.version import version, user_agent dotenv.load_dotenv() consumer_key = os.environ.get("CONSUMER_KEY") consumer_secret = os.environ.get("CONSUMER_SECRET") bearer_token = os.environ.get("BEARER_TOKEN") access_token = os.environ.get("ACCESS_TOKEN") access_token_secret = os.environ.get("ACCESS_TOKEN_SECRET") test_data = pathlib.Path("test-data") logging.basicConfig(filename="test.log", level=logging.INFO) # Implicitly test the constructor in application auth mode. This ensures that # the tests don't depend on test ordering, and allows using the pytest # functionality to only run a single test at a time. T = twarc.Twarc2( consumer_key=consumer_key, consumer_secret=consumer_secret, ) def test_version(): import setup assert setup.version == version assert user_agent assert f"twarc/{version}" in user_agent def test_auth_types_interaction(): """ Test the various options for configuration work as expected. """ # 1. bearer_token auth -> app auth tw = twarc.Twarc2(bearer_token=bearer_token) assert tw.auth_type == "application" for response in tw.user_lookup(range(1, 101)): assert response["data"] tw.client.close() # 2. consumer_keys tw = twarc.Twarc2(consumer_key=consumer_key, consumer_secret=consumer_secret) assert tw.auth_type == "application" for response in tw.user_lookup(range(1, 101)): assert response["data"] tw.client.close() # 3. 
@pytest.mark.parametrize("sort_order", ["recency", "relevancy"])
def test_search_recent(sort_order):
    """Two pages of recent search results hold between 100 and 200 tweets."""
    found_tweets = 0

    pages = T.search_recent("politics", sort_order=sort_order)
    for page_number, response_page in enumerate(pages, start=1):
        found_tweets += len(response_page["data"])
        if page_number == 2:
            break

    assert 100 <= found_tweets <= 200


def test_counts_recent():
    """The first page of daily recent counts covers about a week."""
    pages = T.counts_recent("twitter is:verified", granularity="day")

    # only the first page is looked at
    first_page = next(iter(pages), None)
    found_counts = 0 if first_page is None else len(first_page["data"])

    assert 7 <= found_counts <= 8


@pytest.mark.skipif(
    os.environ.get("SKIP_ACADEMIC_PRODUCT_TRACK") is not None,
    reason="No Academic Research Product Track access",
)
def test_counts_empty_page():
    """Daily counts are returned for every day of the requested period."""
    pages = T.counts_all(
        "beans",
        start_time=datetime.datetime(2006, 3, 21),
        end_time=datetime.datetime(2006, 6, 1),
        granularity="day",
    )

    found_counts = sum(len(response_page["data"]) for response_page in pages)

    assert found_counts == 72
def test_user_ids_lookup():
    """Every looked up user id is either found or reported as an error."""
    users_found = 0
    users_not_found = 0

    for response in T.user_lookup(range(1, 1000)):
        # a response may lack "data" or "errors" when it has none to report
        users_found += len(response.get("data", []))

        # Note that errors includes lookup of contained entitites within a
        # tweet, so a pinned tweet that doesn't exist anymore results in an
        # additional error entry, even if the profile is present.
        users_not_found += sum(
            1
            for error in response.get("errors", [])
            if error["resource_type"] == "user"
        )

    assert users_found >= 1
    assert users_found + users_not_found == 999


def test_usernames_lookup():
    """Looking up three usernames finds three profiles."""
    usernames = ["jack", "barackobama", "rihanna"]

    users_found = sum(
        len(response.get("data", []))
        for response in T.user_lookup(usernames, usernames=True)
    )

    assert users_found == 3


def test_tweet_lookup():
    """Every looked up tweet id is either found or reported as an error."""
    tweets_found = 0
    tweets_not_found = 0

    for response in T.tweet_lookup(range(1000, 2000)):
        # a response may lack "data" or "errors" when it has none to report
        tweets_found += len(response.get("data", []))

        # Only errors whose resource_type is "tweet" are counted as missing
        # tweets; errors of any other resource_type are ignored.
        tweets_not_found += sum(
            1
            for error in response.get("errors", [])
            if error["resource_type"] == "tweet"
        )

    assert tweets_found >= 1
    assert tweets_found + tweets_not_found == 1000
@pytest.mark.skipif( os.environ.get("GITHUB_ACTIONS") != None, reason="stream() seems to throw a 400 error under GitHub Actions?!", ) def test_stream(): # remove any active stream rules rules = T.get_stream_rules() if "data" in rules and len(rules["data"]) > 0: rule_ids = [r["id"] for r in rules["data"]] T.delete_stream_rule_ids(rule_ids) # make sure they are empty rules = T.get_stream_rules() assert "data" not in rules # add two rules rules = T.add_stream_rules( [{"value": "hey", "tag": "twarc-test"}, {"value": "joe", "tag": "twarc-test"}] ) assert len(rules["data"]) == 2 # make sure they are there rules = T.get_stream_rules() assert len(rules["data"]) == 2 # these properties should be set assert rules["data"][0]["id"] assert rules["data"][0]["tag"] == "twarc-test" assert rules["data"][1]["id"] assert rules["data"][1]["tag"] == "twarc-test" # the order of the values is not guaranteed assert "hey" in [r["value"] for r in rules["data"]] assert "joe" in [r["value"] for r in rules["data"]] # collect some data event = threading.Event() for count, result in enumerate(T.stream(event=event)): assert result["data"]["id"] assert result["data"]["text"] assert len(result["matching_rules"]) > 0 for rule in result["matching_rules"]: assert rule["id"] assert rule["tag"] == "twarc-test" if count > 25: event.set() assert count > 25 # delete the rules rule_ids = [r["id"] for r in rules["data"]] T.delete_stream_rule_ids(rule_ids) # make sure they are gone rules = T.get_stream_rules() assert "data" not in rules def test_timeline(): """ Test the user timeline endpoints. """ # get @jack's first pages of tweets and mentions found = 0 for pages, tweets in enumerate(T.timeline(12)): found += len(tweets["data"]) if pages == 3: break assert found >= 200 found = 0 for pages, tweets in enumerate(T.mentions(12)): found += len(tweets["data"]) if pages == 3: break assert found >= 200 def test_timeline_username(): """ Test the user timeline endpoints with username. 
""" found = 0 for pages, tweets in enumerate(T.timeline("jack")): found += len(tweets["data"]) if pages == 3: break assert found >= 200 found = 0 for pages, tweets in enumerate(T.mentions("jack")): found += len(tweets["data"]) if pages == 3: break assert found >= 200 def test_missing_timeline(): results = T.timeline(1033441111677788160) assert len(list(results)) == 0 def test_follows(): """ Test followers and and following. """ found = 0 for pages, users in enumerate(T.following(12)): pages += 1 found += len(users["data"]) if pages == 2: break assert found >= 1000 found = 0 for pages, users in enumerate(T.followers(12)): found += len(users["data"]) if pages == 2: break assert found >= 1000 def test_follows_username(): """ Test followers and and following by username. """ found = 0 for pages, users in enumerate(T.following("jack")): pages += 1 found += len(users["data"]) if pages == 2: break assert found >= 1000 found = 0 for pages, users in enumerate(T.followers("jack")): found += len(users["data"]) if pages == 2: break assert found >= 1000 def test_flattened(): """ This test uses the search API to test response flattening. It will look at each tweet to find evidence that all the expansions have worked. Once it finds them all it stops. If it has retrieved 500 tweets and not found any of the expansions it stops and assumes that something is not right. This 500 cutoff or the query may need to be adjusted based on experience. 
""" found_geo = False found_in_reply_to_user = False found_attachments_media = False found_attachments_polls = False found_entities_mentions = False found_referenced_tweets = False count = 0 for response in T.search_recent( "(vote poll has:hashtags has:mentions -is:retweet) OR (checked into has:images -is:retweet)" ): # Search api always returns a response of tweets with metadata but flatten # will put these in a list tweets = twarc.expansions.flatten(response) assert len(tweets) > 1 for tweet in tweets: count += 1 assert "id" in tweet logging.info("got search tweet #%s %s", count, tweet["id"]) author_id = tweet["author_id"] assert "author" in tweet assert tweet["author"]["id"] == author_id if "in_reply_to_user_id" in tweet: assert "in_reply_to_user" in tweet found_in_reply_to_user = True if "attachments" in tweet: if "media_keys" in tweet["attachments"]: assert "media" in tweet["attachments"] assert tweet["attachments"]["media"] assert tweet["attachments"]["media"][0]["width"] found_attachments_media = True if "poll_ids" in tweet["attachments"]: assert "poll" in tweet["attachments"] assert tweet["attachments"]["poll"] found_attachments_polls = True if "geo" in tweet: assert tweet["geo"]["place_id"] assert tweet["geo"]["place_id"] == tweet["geo"]["id"] found_geo = True if "entities" in tweet and "mentions" in tweet["entities"]: assert tweet["entities"]["mentions"][0]["username"] found_entities_mentions = True # need to ensure there are no errors because a referenced tweet # might be protected or deleted in which case it would not have been # included in the response and would not have been flattened if "errors" not in response and "referenced_tweets" in tweet: assert tweet["referenced_tweets"][0]["text"] found_referenced_tweets = True if ( found_geo and found_in_reply_to_user and found_attachments_media and found_attachments_polls and found_entities_mentions and found_referenced_tweets ): logging.info("found all expansions!") elif count > 10000: 
logging.info("didn't find all expansions in 10000 tweets") assert found_geo, "found geo" assert found_in_reply_to_user, "found in_reply_to_user" assert found_attachments_media, "found media" assert found_attachments_polls, "found polls" assert found_entities_mentions, "found mentions" assert found_referenced_tweets, "found referenced tweets" def test_ensure_flattened(): resp = next(T.search_recent("twitter", max_results=20)) # flatten a response flat1 = twarc.expansions.ensure_flattened(resp) assert isinstance(flat1, list) assert len(flat1) > 1 assert "author" in flat1[0] # flatten the flattened list flat2 = twarc.expansions.ensure_flattened(flat1) assert isinstance(flat2, list) assert len(flat2) == len(flat1) assert "author" in flat2[0] # flatten a tweet object which will force it into a list flat3 = twarc.expansions.ensure_flattened(flat2[0]) assert isinstance(flat3, list) assert len(flat3) == 1 # flatten an object without includes: # List of records, data is a dict: flat4 = twarc.expansions.ensure_flattened([{"data": {"fake": "tweet"}}]) assert isinstance(flat4, list) assert len(flat4) == 1 # 1 record, data is a dict: flat5 = twarc.expansions.ensure_flattened({"data": {"fake": "tweet"}}) assert isinstance(flat5, list) assert len(flat5) == 1 # List of records, data is a list: flat6 = twarc.expansions.ensure_flattened([{"data": [{"fake": "tweet"}]}]) assert isinstance(flat6, list) assert len(flat6) == 1 # 1 record, data is a list: flat7 = twarc.expansions.ensure_flattened({"data": [{"fake": "tweet"}]}) assert isinstance(flat7, list) assert len(flat7) == 1 TestCase().assertDictEqual(flat4[0], flat5[0]) TestCase().assertDictEqual(flat6[0], flat7[0]) TestCase().assertDictEqual(flat4[0], flat7[0]) resp.pop("includes") flat8 = twarc.expansions.ensure_flattened(resp) assert len(flat8) > 1 # Flatten worked without includes, wrote empty object: assert "author" in flat8[0] TestCase().assertDictEqual(flat8[0]["author"], {}) # If there's some other type of data: with 
pytest.raises(ValueError): twarc.expansions.ensure_flattened([[{"data": {"fake": "list_of_lists"}}]]) def test_ensure_flattened_errors(): """ Test that ensure_flattened doesn't return tweets for API responses that only contain errors. """ data = {"errors": ["fake error"]} assert twarc.expansions.ensure_flattened(data) == [] def test_ensure_user_id(): """ Test _ensure_user_id's ability to discriminate correctly between IDs and screen names. """ # presumably IDs don't change assert T._ensure_user_id("jack") == "12" # should hold for all users, even if the screen name exists assert T._ensure_user_id("12") == "12" # this is a screen name but not an ID # would help to find more "stable" example? assert T._ensure_user_id("42069") == "17334495" # should 42069 passed as int return ID or screen name? assert T._ensure_user_id("1033441111677788160") == "1033441111677788160" assert T._ensure_user_id(1033441111677788160) == "1033441111677788160" def test_liking_users(): # This is one of @jack's tweets about the Twitter API likes = T.liking_users(1460417326130421765) like_count = 0 for page in likes: assert "data" in page # These should be user objects. assert "description" in page["data"][0] like_count += len(page["data"]) if like_count > 300: break def test_retweeted_by(): # This is one of @jack's tweets about the Twitter API retweet_users = T.retweeted_by(1460417326130421765) retweet_count = 0 for page in retweet_users: assert "data" in page # These should be user objects. assert "description" in page["data"][0] retweet_count += len(page["data"]) if retweet_count > 150: break def test_liked_tweets(): # What has @jack liked? liked_tweets = T.liked_tweets(12) like_count = 0 for page in liked_tweets: assert "data" in page # These should be tweet objects. 
assert "text" in page["data"][0] like_count += len(page["data"]) if like_count > 300: break def test_list_lookup(): parks_list = T.list_lookup(715919216927322112) assert "data" in parks_list assert parks_list["data"]["name"] == "National-parks" def test_list_members(): response = list(T.list_members(715919216927322112)) assert len(response) == 1 members = twarc.expansions.flatten(response[0]) assert len(members) == 8 def test_list_followers(): response = list(T.list_followers(715919216927322112)) assert len(response) >= 2 followers = twarc.expansions.flatten(response[0]) assert len(followers) > 50 def test_list_memberships(): response = list(T.list_memberships("64flavors")) assert len(response) == 1 lists = twarc.expansions.flatten(response[0]) assert len(lists) >= 9 def test_followed_lists(): response = list(T.followed_lists("nasa")) assert len(response) == 1 lists = twarc.expansions.flatten(response[0]) assert len(lists) >= 1 def test_owned_lists(): response = list(T.owned_lists("nasa")) assert len(response) >= 1 lists = twarc.expansions.flatten(response[0]) assert len(lists) >= 11 def test_list_tweets(): response = next(T.list_tweets(715919216927322112)) assert "data" in response tweets = twarc.expansions.flatten(response) assert len(tweets) >= 90 def test_user_lookup_non_existent(): with pytest.raises(ValueError): # This user does not exist, and a value error should be raised T._ensure_user("noasdfasdf") def test_twarc_metadata(): # With metadata (default) event = threading.Event() for i, response in enumerate(T.sample(event=event)): assert "__twarc" in response if i == 10: event.set() for response in T.tweet_lookup(range(1000, 2000)): assert "__twarc" in response assert "__twarc" in twarc.expansions.flatten(response)[0] # Witout metadata T.metadata = False event = threading.Event() for i, response in enumerate(T.sample(event=event)): assert "__twarc" not in response if i == 10: event.set() for response in T.tweet_lookup(range(1000, 2000)): assert "__twarc" not 
in response T.metadata = True def test_docs_requirements(): """ Make sure that the mkdocs requirements has everything that is in the twarc requirements so the readthedocs build doesn't fail. """ twarc_reqs = set(open("requirements.txt").read().split()) mkdocs_reqs = set(open("requirements-mkdocs.txt").read().split()) assert twarc_reqs.issubset(mkdocs_reqs) def test_geo(): print(T.geo(query="Silver Spring")) def pick_id(id, objects): """pick an object out of a list of objects using its id""" return list(filter(lambda o: o["id"] == id, objects)) ================================================ FILE: utils/auth_timing.py ================================================ #!/usr/bin/env python3 """ Twitter's rate limits allow App Auth contexts to search at 450 requests every 15 minutes, and User Auth contexts at 180 requests per 15 minutes. This script exercises both contexts and counts how tweets it is able to receive. We should see a significant number more tweets coming back for App Auth. Typical output should look like: app auth: 44999 user auth: 18000 https://developer.twitter.com/en/docs/basics/rate-limits """ import logging from twarc import Twarc from datetime import datetime from datetime import timedelta logging.basicConfig( filename="time_test.log", level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s", ) def count_tweets(app_auth): """ Search for covid_19 in tweets using the given context and return the number of tweets that were fetched in 10 minutes. 
""" count = 0 t = Twarc(app_auth=app_auth) start = None for tweet in t.search("covid_19"): # start the timer when we get the first tweet if start is None: start = datetime.now() count += 1 if datetime.now() - start > timedelta(minutes=10): break t.client.close() return count print("app auth: ", count_tweets(app_auth=True)) print("user auth: ", count_tweets(app_auth=False)) ================================================ FILE: utils/deduplicate.py ================================================ #!/usr/bin/env python """ Given a JSON file, remove any tweets with duplicate IDs. Optionally, this will extract retweets. (That is, for a retweet use tweet from retweeted_status and retweet.) Example usage: utils/deduplicate.py tweets.jsonl > tweets_deduped.jsonl """ from __future__ import print_function import json import fileinput import argparse def main(files, extract_retweets=False): seen = {} for line in fileinput.input(files=files): tweet = json.loads(line) if extract_retweets and "retweeted_status" in tweet: tweet = tweet["retweeted_status"] id = tweet["id"] if id not in seen: seen[id] = True print(json.dumps(tweet)) if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( "--extract-retweets", action="store_true", help="Extract retweets" ) parser.add_argument( "files", metavar="FILE", nargs="*", help="files to read, if empty, stdin is used", ) args = parser.parse_args() main( args.files if len(args.files) > 0 else ("-",), extract_retweets=args.extract_retweets, ) ================================================ FILE: utils/deleted.py ================================================ #!/usr/bin/env python """ This is a little utility that reads in tweets, rehydrates them, and only outputs the tweets JSON for tweets that are no longer available. 
""" import json import twarc import fileinput t = twarc.Twarc() def missing(tweets): tweet_ids = [t["id_str"] for t in tweets] hydrated = t.hydrate(tweets) hydrated_ids = [t["id_str"] for t in hydrated] missing_ids = tweet_ids - hydrated_ids for t in tweets: if t["id_str"] in missing_ids: yield t tweets = [] for line in fileinput.input(): t = json.loads(line) tweets.append(t) if len(tweets) > 100: for t in missing(tweets): print(json.dumps(t)) tweets = [] if len(tweets) > 0: for t in missing(tweets): print(json.dumps(t)) ================================================ FILE: utils/deleted_users.py ================================================ #!/usr/bin/env python3 """ This utility Will read in user ids, or tweet JSON data, and look up each user_id. If the user no longer exists the user_id or tweet JSON will be written to stdout. If the user exists no output will be written. It acts like a filter to locate deleted accounts. """ import re import json import twarc import logging import fileinput logging.basicConfig(filename="deleted_users.log", level=logging.INFO) t = twarc.Twarc() for line in fileinput.input(): line = line.strip() if re.match("^\d+$", line): user_id = line else: tweet = json.loads(line) user_id = tweet["user"]["id_str"] try: user = next(t.user_lookup([user_id])) except Exception as e: print(line) ================================================ FILE: utils/deletes.py ================================================ #!/usr/bin/env python3 """ This program assumes that you are feeding it tweet JSON data for tweets that have been deleted. It will use the metadata and the API to analyze why each tweet appears to have been deleted. Note that lookups are based on user id, so may give different results than looking up a user by screen name. 
""" import json import fileinput import collections import requests import twarc import argparse import logging USER_OK = "USER_OK" USER_DELETED = "USER_DELETED" USER_PROTECTED = "USER_PROTECTED" USER_SUSPENDED = "USER_SUSPENDED" TWEET_OK = "TWEET_OK" TWEET_DELETED = "TWEET_DELETED" # You have been blocked by the user. TWEET_BLOCKED = "TWEET_BLOCKED" RETWEET_DELETED = "RETWEET_DELETED" ORIGINAL_TWEET_DELETED = "ORIGINAL_TWEET_DELETED" ORIGINAL_TWEET_BLOCKED = "ORIGINAL_TWEET_BLOCKED" ORIGINAL_USER_DELETED = "ORIGINAL_USER_DELETED" ORIGINAL_USER_PROTECTED = "ORIGINAL_USER_PROTECTED" ORIGINAL_USER_SUSPENDED = "ORIGINAL_USER_SUSPENDED" # twarc instance t = None def main(files, enhance_tweet=False, print_results=True, profile=None): global t if profile is not None: t = twarc.Twarc(profile=profile) else: t = twarc.Twarc() counts = collections.Counter() for count, line in enumerate(fileinput.input(files=files)): if count % 10000 == 0: logging.info("processed {:,} tweets".format(count)) tweet = json.loads(line) result = examine(tweet) if enhance_tweet: tweet["delete_reason"] = result print(json.dumps(tweet)) else: print(tweet_url(tweet), result) counts[result] += 1 if print_results: for result, count in counts.most_common(): print(result, count) def examine(tweet): user_status = get_user_status(tweet) # Go with user status first (suspended, protected, deleted) if user_status != USER_OK: return user_status else: retweet = tweet.get("retweeted_status", None) tweet_status = get_tweet_status(tweet) # If not a retweet and tweet deleted, then tweet deleted. 
if tweet_status == TWEET_OK: return TWEET_OK elif retweet is None or tweet_status == TWEET_BLOCKED: return tweet_status else: rt_status = examine(retweet) if rt_status == USER_DELETED: return ORIGINAL_USER_DELETED elif rt_status == USER_PROTECTED: return ORIGINAL_USER_PROTECTED elif rt_status == USER_SUSPENDED: return ORIGINAL_USER_SUSPENDED elif rt_status == TWEET_DELETED: return ORIGINAL_TWEET_DELETED elif rt_status == TWEET_BLOCKED: return ORIGINAL_TWEET_BLOCKED elif rt_status == TWEET_OK: return RETWEET_DELETED else: raise "Unexpected retweet status %s for %s" % ( rt_status, tweet["id_str"], ) users = {} def get_user_status(tweet): user_id = tweet["user"]["id_str"] if user_id in users: return users[user_id] url = "https://api.twitter.com/1.1/users/show.json" params = {"user_id": user_id} # USER_DELETED: 404 and {"errors": [{"code": 50, "message": "User not found."}]} # USER_PROTECTED: 200 and user object with "protected": true # USER_SUSPENDED: 403 and {"errors":[{"code":63,"message":"User has been suspended."}]} result = USER_OK try: resp = t.get(url, params=params, allow_404=True) user = resp.json() if user["protected"]: result = USER_PROTECTED except requests.exceptions.HTTPError as e: try: resp_json = e.response.json() except json.decoder.JSONDecodeError: raise e if e.response.status_code == 404 and has_error_code(resp_json, 50): result = USER_DELETED elif e.response.status_code == 403 and has_error_code(resp_json, 63): result = USER_SUSPENDED else: raise e users[user_id] = result return result tweets = {} def get_tweet_status(tweet): id = tweet["id_str"] if id in tweets: return tweets[id] # USER_SUSPENDED: 403 and {"errors":[{"code":63,"message":"User has been suspended."}]} # USER_PROTECTED: 403 and {"errors":[{"code":179,"message":"Sorry, you are not authorized to see this status."}]} # TWEET_DELETED: 404 and {"errors":[{"code":144,"message":"No status found with that ID."}]} # or {"errors":[{"code":34,"message":"Sorry, that page does not exist."}]} url 
= "https://api.twitter.com/1.1/statuses/show.json" params = {"id": id} result = TWEET_OK try: t.get(url, params=params, allow_404=True) except requests.exceptions.HTTPError as e: try: resp_json = e.response.json() except json.decoder.JSONDecodeError: raise e if e.response.status_code == 404 and has_error_code(resp_json, (34, 144)): result = TWEET_DELETED elif e.response.status_code == 403 and has_error_code(resp_json, 63): result = USER_SUSPENDED elif e.response.status_code == 403 and has_error_code(resp_json, 179): result = USER_PROTECTED elif e.response.status_code == 401 and has_error_code(resp_json, 136): result = TWEET_BLOCKED else: raise e tweets[id] = result return result def tweet_url(tweet): return "https://twitter.com/%s/status/%s" % ( tweet["user"]["screen_name"], tweet["id_str"], ) def has_error_code(resp, code): if isinstance(code, int): code = (code,) for error in resp["errors"]: if error["code"] in code: return True return False if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( "--enhance", action="store_true", help="Enhance tweet with delete_reason and output enhanced tweet.", ) parser.add_argument( "--skip-results", action="store_true", help="Skip outputting delete reason summary", ) parser.add_argument("--profile", help="The twarc API profile to use") parser.add_argument( "files", metavar="FILE", nargs="*", help="files to read, if empty, stdin is used", ) args = parser.parse_args() main( args.files if len(args.files) > 0 else ("-",), enhance_tweet=args.enhance, print_results=not args.skip_results and not args.enhance, profile=args.profile, ) ================================================ FILE: utils/embeds.py ================================================ #!/usr/bin/env python from __future__ import print_function import json import fileinput for line in fileinput.input(): tweet = json.loads(line) if "media" in tweet["entities"]: for media in tweet["entities"]["media"]: print(media["media_url"]) 
================================================ FILE: utils/emojis.py ================================================ #!/usr/bin/env python3 import re import json import fileinput import collections import optparse import emoji opt_parser = optparse.OptionParser() opt_parser.add_option("-n", "--number", dest="number", type="int", default=10) options, args = opt_parser.parse_args() tweets = args number_of_emojis = options.number tweets = tweets.pop() counts = collections.Counter() EMOJI_RE = emoji.get_emoji_regexp() for line in open(tweets): tweet = json.loads(line) if "full_text" in tweet: text = tweet["full_text"] else: text = tweet["text"] for char in EMOJI_RE.findall(text): counts[char] += 1 for char, count in counts.most_common(number_of_emojis): print("%s %5i" % (char, count)) ================================================ FILE: utils/extractor.py ================================================ #!/usr/bin/env python3 from datetime import datetime import json import os import re import argparse import csv import copy import sys import gzip strptime = datetime.strptime class attriObject: """Class object for attribute parser.""" def __init__(self, string): self.value = re.split(":", string) self.title = self.value[-1] def getElement(self, json_object): found = [json_object] for entry in self.value: for index in range(len(found)): try: found[index] = found[index][entry] except (TypeError, KeyError): print( "'{0}' is not a valid json entry.".format(":".join(self.value)) ) sys.exit() # If single search object is a list, search entire list. Error if nested lists. if isinstance(found[index], list): if len(found) > 1: raise Exception( "Extractor currently does not handle nested lists." 
) found = found[index] return found def tweets_files(string, path): """Iterates over json files in path.""" for filename in os.listdir(path): if re.match(string, filename) and ".jsonl" in filename: f = gzip.open if ".gz" in filename else open yield path + filename, f Ellipsis def parse(args): with open(args.output, "w+", encoding="utf-8") as output: csv_writer = csv.writer(output, dialect=args.dialect) csv_writer.writerow([a.title for a in args.attributes]) count = 0 tweets = set() for filename, f in tweets_files(args.string, args.path): print("parsing", filename) with f(filename, "rb") as data_file: for line in data_file: try: json_object = json.loads(line.decode("utf-8")) except ValueError: print("Error in", filename, "entry incomplete.") continue # Check for duplicates identity = json_object["id"] if identity in tweets: continue tweets.add(identity) # Check for time restrictions. if args.start or args.end: tweet_time = strptime( json_object["created_at"], "%a %b %d %H:%M:%S +0000 %Y" ) if args.start and args.start > tweet_time: continue if args.end and args.end < tweet_time: continue # Check for hashtag. 
if args.hashtag: for entity in json_object["entities"]["hashtags"]: if entity["text"].lower() == args.hashtag: break else: continue count += extract(json_object, args, csv_writer) print("Searched", len(tweets), "tweets and recorded", count, "items.") print("largest id:", max(tweets)) def extract(json_object, args, csv_writer): """Extract and write found attributes.""" found = [[]] for attribute in args.attributes: item = attribute.getElement(json_object) if len(item) == 0: for row in found: row.append("NA") else: found1 = [] for value in item: if value is None: value = "NA" new = copy.deepcopy(found) for row in new: row.append(value) found1.extend(new) found = found1 for row in found: csv_writer.writerow(row) return len(found) if __name__ == "__main__": parser = argparse.ArgumentParser(description="Extracts attributes from tweets.") parser.add_argument( "attributes", nargs="*", help="Attributes to search for. Attributes inside nested inside other attributes should be seperated by a colon. Example: user:screen_name, entities:hashtags:text.", ) parser.add_argument( "-dialect", default="excel", help="Sets dialect for csv output. Defaults to excel. See python module csv.list_dialects()", ) parser.add_argument( "-string", default="", help="Regular expression for files to parse. Defaults to empty string.", ) parser.add_argument( "-path", default="./", help="Optional path to folder containing tweets. Defaults to current folder.", ) parser.add_argument( "-output", default="output.csv", help="Optional file to output results. Defaults to output.csv.", ) parser.add_argument( "-start", default="", help="Define start date for tweets. Format (mm:dd:yyyy)" ) parser.add_argument( "-end", default="", help="Define end date for tweets. Format (mm:dd:yyyy)" ) parser.add_argument( "-hashtag", default="", help="Define a hashtag that must be in parsed tweets." 
) args = parser.parse_args() if not args.path.endswith("/"): args.path += "/" args.start = strptime(args.start, "%m:%d:%Y") if args.start else False args.end = strptime(args.end, "%m:%d:%Y") if args.end else False args.attributes = [attriObject(i) for i in args.attributes] args.string = re.compile(args.string) args.hashtag = args.hashtag.lower() parse(args) ================================================ FILE: utils/filter_date.py ================================================ #!/usr/bin/env python """ Given a minimum and/or maximum date, filter out all tweets after this date. For example, if a hashtag was used for another event before the one you're interested in, you can filter out the old ones. Example usage: utils/filter_date.py --mindate 1-may-2014 tweets.jsonl > filtered.jsonl """ from __future__ import print_function import sys import json import fileinput import argparse import datetime from dateutil.parser import parse def filter_input(mindate, maxdate, files): mindate = parse(mindate) if mindate is not None else datetime.datetime.min maxdate = parse(maxdate) if maxdate is not None else datetime.datetime.max for line in fileinput.input(files): tweet = json.loads(line) created_at = parse(tweet["created_at"]) created_at = created_at.replace(tzinfo=None) if mindate < created_at and maxdate > created_at: print(json.dumps(tweet)) def main(): parser = argparse.ArgumentParser() parser.add_argument("--mindate", help="the minimum date", default=None) parser.add_argument("--maxdate", help="the maximum date", default=None) parser.add_argument("files", nargs="?", default=[]) args = parser.parse_args() filter_input(args.mindate, args.maxdate, args.files) if __name__ == "__main__": main() ================================================ FILE: utils/filter_users.py ================================================ #!/usr/bin/env python """ Filters tweets posted by a list of users. The list is supplied in a file. 
The file can contain: * screen names * user ids * screen name,user id * user id,screen name where each appears on a separate line. When a user id is provided, it will be used. Otherwise, screen name will be used. There is also an option to filter by tweets NOT posted by the list of users. """ import argparse import fileinput import json import logging def read_user_list_file(user_list_filepath): screen_names = set() user_ids = set() with open(user_list_filepath) as f: for count, line in enumerate(f): split_line = line.rstrip("\n\r").split(",") if _is_header(count, split_line): continue if split_line[0].isdigit(): user_ids.add(split_line[0]) else: screen_names.add(split_line[0]) if len(split_line) > 1 and split_line[1].isdigit(): user_ids.add(split_line[1]) assert screen_names or user_ids return user_ids, screen_names def _is_header(count, split_line): # If this is first line and there is more than one part and none are all digit, then a header if count == 0: for part in split_line: if part.isdigit(): return False return True return False def main(files, user_ids, screen_names, positive_match=True): for count, line in enumerate(fileinput.input(files=files)): try: tweet = json.loads(line.rstrip("\n")) match = False if user_ids and tweet["user"]["id_str"] in user_ids: match = True elif tweet["user"]["screen_name"] in screen_names: match = True if not positive_match: match = not match if match: print(line.rstrip("\n")) if count % 100000 == 0: logging.info("processed {:,} tweets".format(count)) except json.decoder.JSONDecodeError: pass if __name__ == "__main__": logging.basicConfig( level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s" ) parser = argparse.ArgumentParser() parser.add_argument( "--neg-match", action="store_true", help="Return tweets that do not match users" ) parser.add_argument( "user_list_file", help="file containing list of users to filter tweets by" ) parser.add_argument( "tweet_files", metavar="FILE", nargs="*", help="file containing 
tweets to filter, if empty, " "stdin is used", ) args = parser.parse_args() m_user_ids, m_screen_names = read_user_list_file(args.user_list_file) main( args.tweet_files if len(args.tweet_files) > 0 else ("-",), m_user_ids, m_screen_names, positive_match=not args.neg_match, ) ================================================ FILE: utils/flakey.py ================================================ #!/usr/bin/env python3 # # This program will read tweet ids (Snowflake IDs) from a file or a pipe and # write the tweet ids back out again with their extracted creation time # (RFC 3339) as csv. # # usage: flakey.py ids.txt > ids-times.csv # # For more about Snowflake IDs see: # https://ws-dl.blogspot.com/2019/08/2019-08-03-tweetedat-finding-tweet.html # import fileinput from datetime import datetime def id2time(tweet_id): ms = (tweet_id >> 22) + 1288834974657 dt = datetime.utcfromtimestamp(ms // 1000) return dt.replace(microsecond=ms % 1000 * 1000) print("id,created_at") for line in fileinput.input(): tweet_id = int(line) created_at = id2time(tweet_id).strftime("%Y-%m-%dT%H:%M:%S.%f")[0:-3] + "Z" print("{},{}".format(tweet_id, created_at)) ================================================ FILE: utils/foaf.py ================================================ #!/usr/bin/env python3 """ This is a utility for getting the friend-of-a-friend network for a given twitter user. 
It writes a sqlite database as it collects the data {user-id}.sqlite and once complete it exports that data to two csv files: * {user-id}.csv - the user id links * {user-id}-users.csv - metadata about the users keyed off their user id """ import re import csv import sys import twarc import logging import sqlite3 import argparse import requests from dateutil.parser import parse as parse_datetime logging.basicConfig( filename="foaf.log", level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s", ) def friendships(user_id, level=2): """ Pass in a user_id and you will be returned a generator of friendship tuples (user_id, friend_id). By default it will return the friend of a friend network (level=2), but you can expand this by settings the level parameter to either another number. But beware, it could run for a while! """ logging.info("getting friends for user %s", user_id) level -= 1 try: count = 0 for friend_id in t.friend_ids(user_id): count += 1 add_friendship(user_id, friend_id) yield (user_id, friend_id) if level > 0: if not user_in_db(friend_id): yield from friendships(friend_id, level) else: logging.info("already collected %s", friend_id) if count % 1000 == 0: db.commit() except requests.exceptions.HTTPError as e: if e.response.status_code == 401: logging.error("can't get friends for protected user %s", user_id) else: raise (e) def user_ids(): """ Returns all the Twitter user_ids in the database. """ sql = """ SELECT DISTINCT(user_id) AS user_id FROM friends UNION SELECT DISTINCT(friend_id) AS user_id FROM friends """ for result in db.execute(sql): yield str(result[0]) def user_in_db(user_id): """ Checks to see if the user's friends have already been collected. """ results = db.execute("SELECT COUNT(*) FROM friends where user_id = ?", [user_id]) return results.fetchone()[0] > 0 def add_friendship(user_id, friend_id): """ Add a friendship to the database. 
""" db.execute( "INSERT INTO friends (user_id, friend_id) VALUES (?, ?)", [user_id, friend_id] ) def add_user(u): """ Add a user to the database. """ db.execute( """ INSERT INTO users ( user_id, screen_name, name, description, location, created, statuses, verified ) VALUES (?, ?, ?, ?, ?, ?, ?, ?) """, [ u["id"], u["screen_name"], u["name"], u["description"], u["location"], parse_datetime(u["created_at"]).strftime("%Y-%m-%d %H:%M:%S"), u["statuses_count"], u["verified"], ], ) # get command line arguments parser = argparse.ArgumentParser("tweet.py") parser.add_argument("user", action="store", help="user_id") parser.add_argument( "--level", type=int, action="store", default=2, help="how far out into the social graph to follow", ) args = parser.parse_args() # create twarc instance for querying Twitter t = twarc.Twarc() # get the seed user_id, potentially from their screen name if re.match("^\d+$", args.user): seed_user_id = args.user else: seed_user_id = next(t.user_lookup([args.user]))["id_str"] # setup sqlite db for storing information as it is collected db = sqlite3.connect(f"{seed_user_id}.sqlite3") db.execute( """ CREATE TABLE IF NOT EXISTS friends ( user_id INT, friend_id INT, PRIMARY KEY (user_id, friend_id) ) """ ) db.execute( """ CREATE TABLE IF NOT EXISTS users ( user_id INT, screen_name TEXT, name TEXT, description TEXT, location TEXT, created TEXT, statuses INT, verified TEXT, PRIMARY KEY (user_id) ) """ ) # lookup friendship data for friendship in friendships(seed_user_id, args.level): print("%s,%s" % friendship) # lookup user metadata for user in t.user_lookup(user_ids()): add_user(user) db.commit() # write out friendships with open("{}.csv".format(seed_user_id), "w") as fh: w = csv.writer(fh) w.writerow(["user_id", "friend_user_id"]) for row in db.execute("SELECT * FROM friends"): w.writerow(row) # write out user data as csv with open("{}-users.csv".format(seed_user_id), "w") as fh: w = csv.writer(fh) w.writerow( [ "user_id", "screen_name", "name", 
"description", "location", "created", "statuses", "verified", ] ) sql = """ SELECT user_id, screen_name, name, description, location, created, statuses, verified FROM users """ for row in db.execute(sql): w.writerow(row) ================================================ FILE: utils/gender.py ================================================ #!/usr/bin/env python """ filters tweets based on a guess about the users gender """ from __future__ import print_function import json import optparse import fileinput from genderator.detector import Detector, MALE, FEMALE, ANDROGYNOUS usage = "usage: gender.py --gender [male|female|unknown] tweet_file *" opt_parser = optparse.OptionParser(usage=usage) opt_parser.add_option( "-g", "--gender", dest="gender", choices=["male", "female", "unknown"], action="store", ) options, args = opt_parser.parse_args() if not options.gender: opt_parser.error("must supply --gender") d = Detector() for line in fileinput.input(args): line = line.strip() tweet = json.loads(line) name = tweet["user"]["name"] first_name = name.split(" ")[0] gender = d.getGender(first_name) if options.gender == "male" and gender == MALE: print(line.encode("utf-8")) elif options.gender == "female" and gender == FEMALE: print(line.encode("utf-8")) elif options.gender == "unknown" and gender == ANDROGYNOUS: print(line.encode("utf-8")) ================================================ FILE: utils/geo.py ================================================ #!/usr/bin/env python """ Filter tweets/retweets that have geocoding. 
""" from __future__ import print_function import json import fileinput for line in fileinput.input(): tweet = json.loads(line) if "retweeted_status" in tweet: if tweet["retweeted_status"]["geo"]: print(json.dumps(tweet)) elif tweet["geo"]: print(json.dumps(tweet)) ================================================ FILE: utils/geofilter.py ================================================ #!/usr/bin/env python from __future__ import print_function import argparse import json import sys from shapely.geometry import shape def process(line, has_coordinates=None, has_place=None, fence=None): tweet = json.loads(line) coordinates = tweet.get("coordinates") place = tweet.get("place") if any( [ has_coordinates and not coordinates, has_coordinates is False and coordinates, has_place and not place, has_place is False and place, ] ): return if fence and (coordinates or place): if coordinates: location = shape(coordinates) else: location = shape(place["bounding_box"]) if not fence.contains(location): return print(line.strip("\n")) def main(): parser = argparse.ArgumentParser() parser.add_argument( "infile", nargs="?", type=argparse.FileType("r"), default=sys.stdin ) parser.add_argument( "--yes-coordinates", dest="has_coordinates", action="store_true" ) parser.add_argument( "--no-coordinates", dest="has_coordinates", action="store_false" ) parser.add_argument("--yes-place", dest="has_place", action="store_true") parser.add_argument("--no-place", dest="has_place", action="store_false") parser.add_argument("--fence", default=None, help="geojson file with geofence") parser.set_defaults(has_coordinates=None, has_place=None) args = parser.parse_args() fence = None if args.fence: with open(args.fence, "r") as f: fence = shape(json.loads(f.read())) for line in args.infile: process(line, args.has_coordinates, args.has_place, fence) if __name__ == "__main__": main() ================================================ FILE: utils/geojson.py ================================================ 
def text(t):
    """
    Return the text of a tweet on a single line.

    The first non-empty value of full_text, extended_tweet.full_text and
    text is used, and every newline in it is replaced by a space.
    """
    body = t.get("full_text")
    if not body:
        body = t.get("extended_tweet", {}).get("full_text")
    if not body:
        body = t["text"]
    return body.replace("\n", " ")
tweet["place"]["bounding_box"]["coordinates"][0] if args.centroid: min_x = bbox[0][0] min_y = bbox[0][1] max_x = bbox[2][0] max_y = bbox[2][1] fuzz_x = args.fuzz * random.uniform(-1, 1) fuzz_y = args.fuzz * random.uniform(-1, 1) center_x = ((max_x + min_x) / 2.0) + fuzz_x center_y = ((max_y + min_y) / 2.0) + fuzz_y f["geometry"] = {"type": "Point", "coordinates": [center_x, center_y]} else: f["geometry"] = { "type": "Polygon", "coordinates": [[bbox[0], bbox[1], bbox[2], bbox[3], bbox[0]]], } if "geometry" in f: features.append(f) geojson = {"type": "FeatureCollection", "features": features} print(json.dumps(geojson, indent=2)) ================================================ FILE: utils/json2csv.py ================================================ #!/usr/bin/env python """ A sample JSON to CSV program. Multivalued JSON properties are space delimited CSV columns. If you'd like it adjusted send a pull request! """ from twarc import json2csv import os import sys import json import codecs import argparse import fileinput if sys.version_info[0] < 3: try: import unicodecsv as csv except ImportError: sys.exit("unicodecsv is required for python 2") else: import csv def main(): parser = argparse.ArgumentParser() parser.add_argument("--output", "-o", help="write output to file instead of stdout") parser.add_argument( "--split", "-s", help="if writing to file, split into multiple files with this many lines per " "file", type=int, default=0, ) parser.add_argument( "--extra-field", "-e", help="extra fields to include. Provide a field name and a pointer to " "the field. 
def extra_field(t, field_str):
    """
    Look up a dotted pointer such as "user.verified" in a tweet dictionary.

    :param t: the tweet dictionary
    :param field_str: dot separated key names, outermost first
    :returns: the value found at the pointer, or None when any step of the
        path is missing or leads through a value that is not a dictionary
    """
    obj = t
    for field in field_str.split("."):
        # Only dictionaries can be descended into. Without this check an int
        # or None raises TypeError on ``in``, and a string does a substring
        # test and then fails on indexing.
        if isinstance(obj, dict) and field in obj:
            obj = obj[field]
        else:
            return None
    return obj
class GetResource(threading.Thread):
    """
    Worker thread that fetches URLs taken from a queue.

    Each successfully opened response is placed on the module level
    out_queue as the list [headers, raw stream, url, status code, reason].
    """

    def __init__(self, q):
        """
        :param q: queue of URLs to download
        """
        threading.Thread.__init__(self)
        self.q = q
        self.rlock = threading.Lock()
        self.out_queue = out_queue
        self.d = Dedup()

    def run(self):
        """
        Fetch URLs from the queue forever; request errors are logged and
        printed, and the worker moves on to the next URL.
        """
        while True:
            host = self.q.get()
            try:
                # identity encoding + stream=True hands the undecoded body
                # to the consumer of the output queue
                r = requests.get(
                    host, headers={"Accept-Encoding": "identity"}, stream=True
                )
            except requests.exceptions.RequestException as e:
                # Log the URL that was requested. The response data does not
                # exist when the request itself failed, so referring to it
                # here raised an error and killed the worker thread.
                logging.error("%s for %s", e, host)
                print(e)
            else:
                data = [r.raw.headers.items(), r.raw, host, r.status_code, r.reason]
                print(host)
                self.out_queue.put(data)
            finally:
                # always account for the item so q.join() can return
                self.q.task_done()
class Dedup:
    """
    Sqlite backed lookup table from payload digest to URL.

    Stolen from warcprox
    https://github.com/internetarchive/warcprox/blob/master/warcprox/dedup.py

    Every call opens its own connection to the database file and closes it
    again, also when the statement fails.
    """

    def __init__(self):
        # the database lives next to the harvested files
        self.file = os.path.join(args.archive_dir, "dedup.db")

    def start(self):
        """Create the dedup table if it does not exist yet."""
        conn = sqlite3.connect(self.file)
        try:
            conn.execute(
                "create table if not exists dedup ("
                " key varchar(300) primary key,"
                " value varchar(4000)"
                ");"
            )
            conn.commit()
        finally:
            conn.close()

    def save(self, digest_key, url):
        """Store (or overwrite) the URL recorded for digest_key."""
        conn = sqlite3.connect(self.file)
        try:
            conn.execute(
                "insert or replace into dedup (key, value) values (?, ?)",
                (digest_key, url),
            )
            conn.commit()
        finally:
            conn.close()

    def lookup(self, digest_key, url=None):
        """
        Return True when digest_key has been saved before, else False.

        The url parameter is accepted but not used.
        """
        conn = sqlite3.connect(self.file)
        try:
            cursor = conn.execute(
                "select value from dedup where key = ?", (digest_key,)
            )
            return cursor.fetchone() is not None
        finally:
            conn.close()
def parse_binlinks_from_tweet(tweetdict):
    """Parse binary file url:s from a single tweet.

    Collects the user's profile image and profile background image URLs,
    followed by the media URLs found in the tweet's extended entities.

    :tweetdict: json data dict for tweet
    :returns: list of urls for media files

    """
    urls = []
    user = tweetdict.get("user")
    if user:
        for key in ("profile_image_url_https", "profile_background_image_url_https"):
            # Skip missing or null values so that a KeyError is not raised
            # and None never ends up in the list of URLs to download.
            url = user.get(key)
            if url:
                urls.append(url)
    if "extended_entities" in tweetdict:
        urls.extend(parse_extended_entities(tweetdict["extended_entities"]))
    return urls
if __name__ == "__main__":
    # Command line entry point: parse the arguments into the module level
    # ``args`` that main() and Dedup read, then run the harvest.
    parser = argparse.ArgumentParser("archive")
    parser.add_argument(
        "tweet_file", action="store", help="a twitter jsonl.gz input file"
    )
    parser.add_argument(
        "archive_dir",
        action="store",
        help="a directory where the resulting warc is stored",
    )
    parser.add_argument(
        "--threads",
        action="store",
        # type=int makes argparse reject a non-numeric value with a usage
        # error instead of a ValueError traceback later in main()
        type=int,
        default=1,
        help="Number of threads that fetches media resources",
    )
    args = parser.parse_args()
    main()
# # --- # # build a reply, quote, retweet network from a file of tweets and write it # out as a gexf, dot, json or html file. You will need to have networkx # installed and pydotplus if you want to use dot. The html presentation # uses d3 to display the network graph in your browser. # # ./network.py tweets.jsonl network.html # # or # ./network.py tweets.jsonl network.dot # # or # # ./network.py tweets.jsonl network.gexf # # if you would rather have the network oriented around nodes that are users # instead of tweets use the --users flag # # ./network.py --users tweets.jsonl network.gexf # # if you would rather have the network oriented around nodes that are hashtags # instead of tweets or users, use the --hashtags flag # # TODO: this is mostly here some someone can improve it :) import sys import json import networkx import optparse import itertools import time from networkx import nx_pydot from networkx.readwrite import json_graph usage = "network.py tweets.jsonl graph.html" opt_parser = optparse.OptionParser(usage=usage) opt_parser.add_option( "--retweets", dest="retweets", action="store_true", help="include retweets" ) opt_parser.add_option( "--min_subgraph_size", dest="min_subgraph_size", type="int", help="remove any subgraphs with a size smaller than this number", ) opt_parser.add_option( "--max_subgraph_size", dest="max_subgraph_size", type="int", help="remove any subgraphs with a size larger than this number", ) opt_parser.add_option( "--users", dest="users", action="store_true", help="show user relations instead of tweet relations", ) opt_parser.add_option( "--hashtags", dest="hashtags", action="store_true", help="show hashtag relations instead of tweet relations", ) options, args = opt_parser.parse_args() if len(args) != 2: opt_parser.error("must supply input and output file names") tweets, output = args G = networkx.DiGraph() def add(from_user, from_id, to_user, to_id, type, created_at=None): "adds a relation to the graph" # storing start_data will allow 
def to_json(g):
    """
    Convert a graph into a {"nodes": [...], "links": [...]} dictionary.

    Each node entry carries the node id plus its "type" and "screen_name"
    attributes (None when absent); each link entry carries source, target
    and the edge's "type" attribute.
    """
    nodes = [
        {
            "id": node_id,
            "type": attrs.get("type"),
            "screen_name": attrs.get("screen_name"),
        }
        for node_id, attrs in g.nodes(True)
    ]
    links = [
        {"source": src, "target": dst, "type": edge_attrs.get("type")}
        for src, dst, edge_attrs in g.edges(data=True)
    ]
    return {"nodes": nodes, "links": links}
# Drop every component whose node count is below --min_subgraph_size or
# above --max_subgraph_size.
if options.min_subgraph_size or options.max_subgraph_size:
    g_copy = G.copy()
    # G is a DiGraph. networkx.connected_component_subgraphs() was removed
    # in networkx 2.4 and was never implemented for directed graphs, so the
    # components are taken from weakly_connected_components(), which yields
    # one set of nodes per component.
    for component in networkx.weakly_connected_components(G):
        size = len(component)
        if options.min_subgraph_size and size < options.min_subgraph_size:
            g_copy.remove_nodes_from(component)
        elif options.max_subgraph_size and size > options.max_subgraph_size:
            g_copy.remove_nodes_from(component)
    G = g_copy
""" % graph_data ) open(output, "w").write(html) ================================================ FILE: utils/noretweets.py ================================================ #!/usr/bin/env python """ Given a JSON file, remove any retweets. Example usage: utils/noretweets.py tweets.jsonl > tweets_noretweets.jsonl """ from __future__ import print_function import json import fileinput for line in fileinput.input(): tweet = json.loads(line) if not "retweeted_status" in tweet: print(json.dumps(tweet)) ================================================ FILE: utils/oembeds.py ================================================ #!/usr/bin/env python3 """ oembeds.py will read a stream of tweet JSON and augment .entities.urls with oembed metadata for the URL. It uses the oembedders python module and a sqlite database to prevent multiple lookups for the same URL. Here's an example of how each URL stanza will be augmented: { "url": "https://t.co/ZX6cE5Xbti", "expanded_url": "https://www.youtube.com/watch?v=ybvmu7kM8z0", "display_url": "youtube.com/watch?v=ybvmu7…", "indices": [ 106, 129 ], "oembed": { "html": "", "thumbnail_url": "https://i.ytimg.com/vi/ybvmu7kM8z0/hqdefault.jpg", "thumbnail_height": 360, "width": 480, "thumbnail_width": 480, "provider_url": "https://www.youtube.com/", "type": "video", "version": "1.0", "title": "Obama knew", "provider_name": "YouTube", "author_url": "https://www.youtube.com/channel/UCAql2DyGU2un1Ei2nMYsqOA", "author_name": "Donald J Trump", "height": 270 } } Hopefully your URL won't be political propaganda from a tyrant like this one. 
""" import json import logging import sqlite3 import fileinput from oembedders import embed def main(): db = OEmbeds() for line in fileinput.input(): tweet = json.loads(line) for ent in tweet["entities"]["urls"]: url = ent.get("unshortened_url") or ent["expanded_url"] if "twitter.com" in url: continue meta, exists = db.get(url) if not exists: try: meta = embed(url) db.put(url, meta) except Exception as e: logging.warn("error while looking up %s: %s", url, e) if meta: ent["oembed"] = meta print(json.dumps(tweet)) class OEmbeds: def __init__(self, path="oembeds.db"): self.db = sqlite3.connect(path) self.db.execute( """ CREATE table IF NOT EXISTS oembeds ( url text PRIMARY KEY, oembed text NOT NULL ) """ ) def put(self, url, metadata): s = json.dumps(metadata) self.db.execute("INSERT INTO oembeds VALUES(?, ?)", [url, s]) self.db.commit() def get(self, url): cursor = self.db.execute("SELECT oembed FROM oembeds WHERE url=?", [url]) result = cursor.fetchone() if result is not None: return json.loads(result[0]), True else: return None, False if __name__ == "__main__": main() ================================================ FILE: utils/remove_limit.py ================================================ #!/usr/bin/env python """ Utility to remove limit warnings from Filter API output. If --warnings was used, you will have the following in output: {"limit": {"track": 2530, "timestamp_ms": "1482168932301"}} This utility removes any limit warnings from output. 
# The two spellings of the start of a limit notice, without and with a
# space after the first colon.
limitbreaker = '{"limit":{"track":'
limit_breaker = '{"limit": {"track":'

for line in fileinput.input():
    if limitbreaker not in line and limit_breaker not in line:
        # Echo the input line as it was read. Passing the raw line through
        # json.dumps() wrapped the whole tweet in a quoted, escaped JSON
        # string, so the output was no longer a tweet JSON stream.
        print(line.rstrip("\n"))
# Print every tweet that is neither marked possibly_sensitive itself nor a
# retweet of a status marked possibly_sensitive.
for raw in fileinput.input():
    tweet = json.loads(raw)
    flagged = "possibly_sensitive" in tweet and tweet["possibly_sensitive"]
    if not flagged and "retweeted_status" in tweet:
        original = tweet["retweeted_status"]
        flagged = "possibly_sensitive" in original and original["possibly_sensitive"]
    if not flagged:
        print(json.dumps(tweet))
# Load every tweet into memory, order them by their "id" value and print
# them back out as JSON lines.
tweets = [json.loads(line) for line in fileinput.input()]
tweets.sort(key=itemgetter("id"))

for tweet in tweets:
    print(json.dumps(tweet))

Twitter client sources

created on the command line with twarc
""" ) for source in sumsort: print("".format(source, summary[source])) print( """
{}{}


created on the command line with twarc.

""" ) # End of file ================================================ FILE: utils/tags.py ================================================ #!/usr/bin/env python from __future__ import print_function import json import fileinput import collections counts = collections.Counter() for line in fileinput.input(): tweet = json.loads(line) for tag in tweet["entities"]["hashtags"]: t = tag["text"].lower() counts[t] += 1 for tag, count in counts.most_common(): print("%5i %s" % (count, tag)) ================================================ FILE: utils/times.py ================================================ #!/usr/bin/env python from __future__ import print_function import sys import json import optparse import fileinput import dateutil.parser from dateutil import tz to_zone = tz.tzlocal() opt_parser = optparse.OptionParser() opt_parser.add_option("-f", "--format", dest="format", default="%Y-%m-%d %H:%M:%S") opt_parser.add_option("-l", "--local", dest="local", action="store_true") opts, args = opt_parser.parse_args() for line in fileinput.input(args): try: tweet = json.loads(line) created_at = dateutil.parser.parse(tweet["created_at"]) # convert to local time if opts.local: created_at = created_at.astimezone(to_zone) print(created_at.strftime(opts.format)) except ValueError as e: sys.stderr.write("uhoh: %s\n" % e) ================================================ FILE: utils/twarc-archive.py ================================================ #!/usr/bin/env python """ This little utility uses twarc to write Twitter search results to a directory of your choosing. It will use the previous results to determine when to stop searching. 
archive_file_fmt = "tweets-%04i.jsonl.gz"
# Raw string with escaped dots: "\d" must not be read as a string escape and
# "." must only match a literal dot so unrelated files are never picked up.
archive_file_pat = r"tweets-(\d+)\.jsonl\.gz$"


def main():
    """Archive new tweets for a search (or timeline) into ``archive_dir``.

    Parses the command line, takes an exclusive lock on the archive
    directory, harvests every tweet newer than the first tweet of the most
    recent archive file and writes them to the next numbered archive file.
    The lock is always released when the harvest ends, even on errors.

    Exits with a message if another process already holds the lock.
    """
    config = os.path.join(os.path.expanduser("~"), ".twarc")
    e = os.environ.get
    parser = argparse.ArgumentParser("archive")
    parser.add_argument(
        "search", action="store", help="search for tweets matching a query"
    )
    parser.add_argument(
        "archive_dir", action="store", help="a directory where results are stored"
    )
    parser.add_argument(
        "--consumer_key",
        action="store",
        default=e("CONSUMER_KEY"),
        help="Twitter API consumer key",
    )
    parser.add_argument(
        "--consumer_secret",
        action="store",
        default=e("CONSUMER_SECRET"),
        help="Twitter API consumer secret",
    )
    parser.add_argument(
        "--access_token",
        action="store",
        default=e("ACCESS_TOKEN"),
        help="Twitter API access key",
    )
    parser.add_argument(
        "--access_token_secret",
        action="store",
        default=e("ACCESS_TOKEN_SECRET"),
        help="Twitter API access token secret",
    )
    parser.add_argument("--profile", action="store", default="main")
    parser.add_argument(
        "-c",
        "--config",
        default=config,
        help="Config file containing Twitter keys and secrets. Overridden by environment config.",
    )
    parser.add_argument(
        "--tweet_mode",
        action="store",
        default="extended",
        dest="tweet_mode",
        choices=["compat", "extended"],
        help="set tweet mode",
    )
    parser.add_argument(
        "--twarc_command",
        action="store",
        default="search",
        choices=["search", "timeline"],
        help="select twarc command to be used for harvest, currently supports search and timeline",
    )

    args = parser.parse_args()

    if not os.path.isdir(args.archive_dir):
        os.mkdir(args.archive_dir)

    logging.basicConfig(
        filename=os.path.join(args.archive_dir, "archive.log"),
        level=logging.INFO,
        format="%(asctime)s %(levelname)s %(message)s",
    )

    lockfile = os.path.join(args.archive_dir, "") + "lockfile"
    try:
        # Mode "x" creates the file atomically and fails if it already
        # exists, so two processes started together cannot both get the lock.
        with open(lockfile, "x") as lockfile_handle:
            lockfile_handle.write(str(os.getpid()))
    except FileExistsError:
        with open(lockfile, "r") as lockfile_handle:
            old_pid = lockfile_handle.read() or "unknown"
        sys.exit(
            "Another twarc-archive.py process with pid "
            + old_pid
            + " is running. If the process is no longer active then it may have been interrupted. In that case remove the 'lockfile' in "
            + args.archive_dir
            + " and run the command again."
        )

    try:
        _harvest(args)
    finally:
        # Release the lock even when the harvest fails or is interrupted,
        # otherwise every later run would refuse to start.
        if os.path.exists(lockfile):
            os.remove(lockfile)


def _harvest(args):
    """Fetch tweets newer than the last archive and write the next archive."""
    logging.info("logging search for %s to %s", args.search, args.archive_dir)

    t = twarc.Twarc(
        consumer_key=args.consumer_key,
        consumer_secret=args.consumer_secret,
        access_token=args.access_token,
        access_token_secret=args.access_token_secret,
        profile=args.profile,
        config=args.config,
        tweet_mode=args.tweet_mode,
    )

    last_archive = get_last_archive(args.archive_dir)
    if last_archive:
        # The first line of the newest archive is used as the since_id.
        with gzip.open(last_archive, "rt") as previous:
            last_id = json.loads(next(previous))["id_str"]
    else:
        last_id = None

    if args.twarc_command == "search":
        tweets = t.search(args.search, since_id=last_id)
    elif args.twarc_command == "timeline":
        # An all-digit argument is treated as a user id, anything else as a
        # screen name.
        if re.match(r"^\d+$", args.search):
            tweets = t.timeline(userid=args.search, since_id=last_id)
        else:
            tweets = t.timeline(screen_name=args.search, since_id=last_id)
    else:
        raise Exception("invalid twarc_command %s" % args.twarc_command)

    next_archive = get_next_archive(args.archive_dir)

    # we only create the file if there are new tweets to save
    # this prevents empty archive files
    fh = None
    try:
        for tweet in tweets:
            if fh is None:
                fh = gzip.open(next_archive, "wt")
            logging.info("archived %s", tweet["id_str"])
            fh.write(json.dumps(tweet))
            fh.write("\n")
    finally:
        if fh is not None:
            fh.close()

    if fh is None:
        logging.info("no new tweets found for %s", args.search)


def get_last_archive(archive_dir):
    """Return the path of the highest-numbered archive file, or None.

    Only file names matching ``archive_file_pat`` are considered.
    """
    count = 0
    for filename in os.listdir(archive_dir):
        m = re.match(archive_file_pat, filename)
        if m:
            count = max(count, int(m.group(1)))
    if count == 0:
        return None
    return os.path.join(archive_dir, archive_file_fmt % count)


def get_next_archive(archive_dir):
    """Return the path the next archive file should be written to.

    The sequence number is one more than the last archive's, or 1 when the
    directory holds no archive yet.
    """
    last_archive = get_last_archive(archive_dir)
    if last_archive:
        m = re.search(archive_file_pat, last_archive)
        count = int(m.group(1)) + 1
    else:
        count = 1
    return os.path.join(archive_dir, archive_file_fmt % count)


if __name__ == "__main__":
    main()
using its id. """ from __future__ import print_function import os import json import twarc import argparse e = os.environ.get parser = argparse.ArgumentParser("tweet.py") parser.add_argument("tweet_id", action="store", help="Tweet ID") parser.add_argument( "--consumer_key", action="store", default=e("CONSUMER_KEY"), help="Twitter API consumer key", ) parser.add_argument( "--consumer_secret", action="store", default=e("CONSUMER_SECRET"), help="Twitter API consumer secret", ) parser.add_argument( "--access_token", action="store", default=e("ACCESS_TOKEN"), help="Twitter API access key", ) parser.add_argument( "--access_token_secret", action="store", default=e("ACCESS_TOKEN_SECRET"), help="Twitter API access token secret", ) args = parser.parse_args() tw = twarc.Twarc( args.consumer_key, args.consumer_secret, args.access_token, args.access_token_secret ) tweet = tw.get("https://api.twitter.com/1.1/statuses/show/%s.json" % args.tweet_id) print(json.dumps(tweet.json(), indent=2)) ================================================ FILE: utils/tweet_compliance.py ================================================ #!/usr/bin/env python """ Supports tweet compliance. See https://developer.twitter.com/en/docs/tweets/compliance/overview. That is, providing the most recent version of a tweet or removing unavailable (deleted or protected) tweets. Also useful for splitting out available tweets from unavailable tweets. Example usage: python tweet_compliance.py test.txt > test.json 2> test_delete.txt For each tweet in a list of tweets or tweet ids provided by standard input or contained in files, looks up the current tweet state. If a tweet is not available and tweet ids are provided, the tweet id is output to standard error. If a tweet is not available and tweets are provided, the (deleted) tweet is output to standard error. Otherwise, the current tweet (i.e., the tweet retrieved from the API) is returned to standard out. Ordering is not guaranteed. 
def process_tweets(tweets):
    """Hydrate one batch of tweets and report which are still available.

    ``tweets`` maps a tweet id to either the tweet dict that was read from
    the input or None when only the id was supplied. Every tweet returned by
    ``t.hydrate`` is printed to standard out as JSON. Every id that did not
    come back is printed to standard error: as the stored tweet's JSON when
    one was supplied, otherwise as the bare id.
    """
    hydrated_ids = set()

    # Current versions of the tweets go to STDOUT.
    for current in t.hydrate(tweets.keys()):
        hydrated_ids.add(current["id_str"])
        print(json.dumps(current))

    # Whatever was asked for but not returned is unavailable: STDERR.
    missing_ids = (tid for tid in tweets if tid not in hydrated_ids)
    for tid in missing_ids:
        stored = tweets[tid]
        if stored:
            print(json.dumps(stored), file=sys.stderr)
        else:
            print(tid, file=sys.stderr)
Example usage: utils/tweet_text.py tweets.jsonl > tweets.txt """ from __future__ import print_function import json import fileinput for line in fileinput.input(): tweet = json.loads(line) if "full_text" in tweet: print(tweet["full_text"].encode("utf8")) else: print(tweet["text"].encode("utf8")) ================================================ FILE: utils/tweet_urls.py ================================================ #!/usr/bin/env python """ Used in conjunction with retweet.py. Prints out the retweet count, and url of the retweeted tweet. Takes in the output from retweet.py tweet_urls.py retweets.jsonl > retweets.txt """ from __future__ import print_function import sys import json import fileinput for line in fileinput.input(): try: tweet = json.loads(line) tweet_id = tweet["id_str"] screen_name = tweet["user"]["screen_name"] retweet_count = tweet["retweet_count"] tweet_urls = "https://twitter.com/%s/status/%s" % (screen_name, tweet_id) print("%d retweets of %s" % (retweet_count, tweet_urls)) except Exception as e: sys.stderr.write("uhoh: %s\n" % e) ================================================ FILE: utils/tweetometer.py ================================================ #!/usr/bin/env python3 """ Reads tweet or Twitter user JSON and outputs a CSV of when the user account was created, how many tweets they have sent to date, and their average tweets per hour. The unit of measurement can be changed to second, minute, day and year with the --unit option. 
""" import json import optparse import fileinput import dateutil.parser from datetime import datetime, timezone op = optparse.OptionParser() op.add_option( "--unit", choices=["second", "minute", "hour", "day", "year"], default="hour" ) opts, args = op.parse_args() if opts.unit == "second": div = 1 elif opts.unit == "minute": div = 60 elif opts.unit == "hour": div = 60 * 60 elif opts.unit == "day": div = 60 * 60 * 24 elif opts.unit == "year": div = 60 * 60 * 24 * 365 now = datetime.now(timezone.utc) print("screen_name,tweets per %s" % opts.unit) for line in fileinput.input(args): t = json.loads(line) if "user" in t: u = t["user"] elif "screen_name" in t: u = t else: raise Exception("not a tweet or user JSON object") created_at = dateutil.parser.parse(u["created_at"]) age = now - created_at unit = age.total_seconds() / float(div) total = u["statuses_count"] tweets_per_unit = total / unit print("%s,%s,%s,%0.2f" % (u["screen_name"], total, created_at, tweets_per_unit)) ================================================ FILE: utils/tweets.py ================================================ #!/usr/bin/env python from __future__ import print_function import json import fileinput import dateutil.parser for line in fileinput.input(): tweet = json.loads(line) created_at = dateutil.parser.parse(tweet["created_at"]) print( ( "[%s] @%s: %s (%s)" % ( created_at.strftime("%Y-%m-%d %H:%M:%S"), tweet["user"]["screen_name"], tweet["text"], tweet["id_str"], ) ).encode("utf8") ) ================================================ FILE: utils/unshrtn.py ================================================ #!/usr/bin/env python3 """ Unfortunately the "expanded_url" as supplied by Twitter aren't fully expanded one hop past t.co. unshrtn.py will attempt to completely unshorten URLs and add them as the "unshortened_url" key to each url, and emit the tweet as JSON again on stdout. 
# number of urls to look up in parallel
POOL_SIZE = 10
unshrtn_url = "http://localhost:3000"
retries = 2
wait = 15

logging.basicConfig(filename="unshorten.log", level=logging.INFO)

# URLs on twitter.com itself are never sent to the unshrtn service.
_TWITTER_URL = re.compile(r"^https?://(api\.)?twitter\.com/")


def _configure(service_url, max_tries, delay):
    """Set the module-level lookup settings (also run in every pool worker)."""
    global unshrtn_url, retries, wait
    unshrtn_url = service_url
    retries = max_tries
    wait = delay


def unshrtn_obj(obj):
    """Return a copy of ``obj`` with ``unshortened_url`` keys added.

    Lists are processed element by element and non-container values are
    returned unchanged. A dict that carries a string ``expanded_url`` (or
    ``url``) outside twitter.com is looked up at the unshrtn service and
    returned with an extra ``unshortened_url`` key; any other dict has its
    values processed recursively.

    If the service cannot be reached after ``retries`` attempts, or its
    answer has no ``long`` field, the dict is returned unchanged so the rest
    of the tweet is still emitted.
    """
    if isinstance(obj, list):
        return [unshrtn_obj(item) for item in obj]
    if not isinstance(obj, dict):
        return obj

    url = obj.get("expanded_url") or obj.get("url")
    # A "url" value can itself be a dict; only strings can be looked up, so
    # anything else is handled like a dict without a url.
    if not url or not isinstance(url, str) or _TWITTER_URL.match(url):
        return {k: unshrtn_obj(v) for k, v in obj.items()}

    u = "{}/?{}".format(
        unshrtn_url, urllib.parse.urlencode({"url": url.encode("utf8")})
    )

    resp = None
    for attempt in range(1, retries + 1):
        try:
            resp = json.loads(urllib.request.urlopen(u).read().decode("utf-8"))
            break
        except Exception as e:
            logging.error(
                "http error: %s when looking up %s. Try %s of %s",
                e,
                url,
                attempt,
                retries,
            )
            # No point in waiting once the last attempt has failed.
            if attempt < retries:
                time.sleep(wait)

    if not isinstance(resp, dict) or "long" not in resp:
        return obj
    return {**obj, "unshortened_url": resp["long"]}


def rewrite_line(line):
    """Return ``line`` re-serialized as JSON with unshortened URLs added.

    A line that cannot be parsed or processed is logged and returned
    exactly as it was received.
    """
    try:
        data = json.loads(line)
        return json.dumps(unshrtn_obj(data))
    except Exception as e:
        # garbage in, garbage out
        logging.error(e)
        return line


def main():
    """Read tweets from files or stdin and print them with unshortened URLs."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--pool-size",
        help="number of urls to look up in parallel",
        default=POOL_SIZE,
        type=int,
    )
    parser.add_argument(
        "--unshrtn", help="url of the unshrtn service", default=unshrtn_url
    )
    parser.add_argument(
        "--retries",
        help="number of time to retry if error from unshrtn service",
        default=retries,
        type=int,
    )
    parser.add_argument(
        "--wait",
        help="number of seconds to wait between retries if error from unshrtn service",
        default=wait,
        type=int,
    )
    parser.add_argument(
        "files",
        metavar="FILE",
        nargs="*",
        help="files to read, if empty, stdin is used",
    )

    args = parser.parse_args()
    settings = (args.unshrtn, args.retries, args.wait)
    _configure(*settings)

    lines = fileinput.input(files=args.files if args.files else ("-",))

    # The initializer applies the command line settings inside every worker;
    # with the "spawn" start method workers re-import this module and would
    # otherwise silently fall back to the defaults.
    with multiprocessing.Pool(
        args.pool_size, initializer=_configure, initargs=settings
    ) as pool:
        for line in pool.imap_unordered(rewrite_line, lines):
            # Lines passed through unchanged still carry their newline;
            # strip it so print() does not emit an extra blank line.
            line = line.rstrip("\n")
            if line:
                print(line)


if __name__ == "__main__":
    main()
AVATAR_DIR = "img"


def download_file(url):
    """Download ``url`` into AVATAR_DIR unless it is already there.

    The local file name is the last path segment of ``url``. The data is
    first written to a ``.part`` file and only renamed once the download is
    complete, so an interrupted run cannot leave a truncated image that a
    later run would treat as already downloaded.

    A failed request (network error, timeout or HTTP error status) is
    reported on standard error and no file is written.

    Returns the local file name (without the directory) in every case.
    """
    local_filename = url.split("/")[-1]
    outfile = os.path.join(AVATAR_DIR, local_filename)
    if os.path.isfile(outfile):
        return local_filename

    partial = outfile + ".part"
    try:
        # Without a timeout a stalled server would hang the whole run.
        r = requests.get(url, stream=True, timeout=30)
        try:
            # Do not store an HTML error page as if it were the avatar.
            r.raise_for_status()
            with open(partial, "wb") as f:
                for chunk in r.iter_content(chunk_size=1024):
                    if chunk:  # filter out keep-alive new chunks
                        f.write(chunk)
        finally:
            r.close()
        os.rename(partial, outfile)
    except requests.RequestException as e:
        sys.stderr.write("could not download %s: %s\n" % (url, e))
    finally:
        # Only present when the download did not finish.
        if os.path.isfile(partial):
            os.remove(partial)
    return local_filename


def text(t):
    """Return the tweet's text on a single line.

    Prefers ``full_text``, then ``extended_tweet.full_text``, then ``text``;
    newlines are replaced by spaces.
    """
    return (
        t.get("full_text") or t.get("extended_tweet", {}).get("full_text") or t["text"]
    ).replace("\n", " ")

Title Here

created on the command line with twarc
""" ) # Make avatar directory if not os.path.isdir(AVATAR_DIR): os.makedirs(AVATAR_DIR) # Parse command-line args reverse = False # If args include --reverse, remove first it, # leaving file name(s) (if any) in args if len(sys.argv) > 1: if sys.argv[1] == "--reverse" or sys.argv[1] == "-r": reverse = True del sys.argv[0] lines = fileinput.input() if reverse: buffered_lines = [] for line in lines: buffered_lines.append(line) # Reverse list using slice lines = buffered_lines[::-1] for line in lines: tweet = json.loads(line) # Download avatar url = tweet["user"]["profile_image_url"] filename = download_file(url) t = { "created_at": tweet["created_at"], "name": tweet["user"]["name"], "username": tweet["user"]["screen_name"], "user_url": "https://twitter.com/" + tweet["user"]["screen_name"], "text": text(tweet), "avatar": AVATAR_DIR + "/" + filename, "url": "https://twitter.com/" + tweet["user"]["screen_name"] + "/status/" + tweet["id_str"], } if "retweet_status" in tweet: t["retweet_count"] = tweet["retweet_status"].get("retweet_count", 0) else: t["retweet_count"] = tweet.get("retweet_count", 0) t["favorite_count"] = tweet.get("favorite_count", 0) t["retweet_string"] = "retweet" if t["retweet_count"] == 1 else "retweets" t["favorite_string"] = "like" if t["favorite_count"] == 1 else "likes" for url in tweet["entities"]["urls"]: a = '%(url)s' % url start, end = url["indices"] t["text"] = t["text"][0:start] + a + t["text"][end:] t["text"] = re.sub( "@([A-Za-z0-9_]+)", r'@\g<1>', t["text"] ) t["text"] = re.sub( " #([^ ]+)", r' #\g<1>', t["text"], ) html = ( """
%(name)s
%(username)s

%(text)s

%(retweet_count)s %(retweet_string)s, %(favorite_count)s %(favorite_string)s
""" % t ) print(html) print( """


created on the command line with twarc.

def main(files, save, force_save, sleep):
    """Report, and optionally create, Wayback Machine captures of tweets.

    Reads one tweet JSON object per line from ``files``. For each tweet its
    twitter.com URL is looked up; when ``save`` is set a missing capture is
    requested, when ``force_save`` is set a capture is always requested.
    Sleeps ``sleep`` seconds after each tweet and prints a found/total
    summary at the end if any tweet was read.
    """
    count = 0
    found_count = 0
    for line in fileinput.input(files):
        tweet = json.loads(line)
        url = "https://twitter.com/{}/status/{}".format(
            tweet["user"]["screen_name"], tweet["id_str"]
        )
        count += 1

        found = lookup(url)
        if found:
            print("{} last archived at {}".format(url, found))
            found_count += 1
        else:
            print("{} not archived".format(url))

        if (not found and save) or force_save:
            archive_url = savepagenow(url)
            if archive_url:
                print("saved {} as {}".format(url, archive_url))
            else:
                print("save failed for {}".format(url))

        time.sleep(sleep)
        print("")

    if count > 0:
        print("{}/{} found".format(found_count, count))


def lookup(url):
    """Return the formatted time of the closest capture of ``url``, or None.

    None is returned when the availability API does not answer with status
    200 or reports no closest snapshot.
    """
    found = None
    # Passing the URL via params lets requests percent-encode it, so a URL
    # containing "&" or "#" cannot corrupt the query string.
    resp = requests.get(
        "https://archive.org/wayback/available", params={"url": url}, timeout=60
    )
    if resp.status_code == 200:
        snapshots = resp.json().get("archived_snapshots") or {}
        if "closest" in snapshots:
            found = timestamp(snapshots["closest"]["timestamp"])
    return found


def savepagenow(url):
    """Ask the Wayback Machine to capture ``url``.

    Returns the capture's URL, or False when the response status is not 200
    or carries no ``content-location`` header.
    """
    resp = requests.get("https://web.archive.org/save/" + url, timeout=300)
    if resp.status_code != 200 or "content-location" not in resp.headers:
        return False
    return "https://web.archive.org" + resp.headers["content-location"]


def timestamp(s):
    """Format a 14-digit ``YYYYMMDDhhmmss`` string as ``YYYY-MM-DD hh:mm:ss``.

    A string in any other shape is returned unchanged instead of raising.
    """
    m = re.match(r"^(\d\d\d\d)(\d\d)(\d\d)(\d\d)(\d\d)(\d\d)$", s)
    if m is None:
        return s
    return "{}-{}-{} {}:{}:{}".format(*m.groups())
"--force-save", action="store_true", dest="force_save", help="Always save at Internet Archive, whether it is archived already or not", ) parser.add_option( "--sleep", dest="sleep", type="int", default=1, help="Time to sleep between requests to Internet Archive", ) (opts, args) = parser.parse_args() main(args, save=opts.save, force_save=opts.force_save, sleep=opts.sleep) ================================================ FILE: utils/webarchives.py ================================================ #!/usr/bin/env python3 """ A program to filter tweets that contain links to a web archive. At the moment it supports archive.org and archive.is, but please add more if you want! """ import json import fileinput archives = ["archive.is", "web.archive.org", "wayback.archive.org"] for line in fileinput.input(): tweet = json.loads(line) for url in tweet["entities"]["urls"]: done = False for host in archives: if host in url["expanded_url"]: print(line, end="") done = True # prevent outputting same data twice if it contains # multiple archive urls if done: break ================================================ FILE: utils/wordcloud.py ================================================ #!/usr/bin/env python from __future__ import print_function import re import sys import json import fileinput def main(): try: from urllib import urlopen # Python 2 except ImportError: from urllib.request import urlopen # Python 3 MAX_WORDS = 100 word_counts = {} stop_words = set( [ "a", "able", "about", "across", "actually", "after", "against", "agreed", "all", "almost", "already", "also", "am", "among", "an", "and", "any", "anyone", "anyway", "are", "as", "at", "be", "because", "been", "being", "between", "but", "by", "can", "cannot", "come", "could", "dear", "did", "do", "does", "either", "else", "ever", "every", "for", "from", "get", "getting", "got", "had", "has", "have", "he", "her", "here", "hers", "hey", "hi", "him", "his", "how", "however", "i", "i'd", "i'll", "i'm", "if", "in", "into", "is", 
"isnt", "isn't", "it", "its", "just", "kind", "last", "latest", "least", "let", "like", "likely", "look", "make", "may", "me", "might", "more", "most", "must", "my", "neither", "new", "no", "nor", "not", "now", "of", "off", "often", "on", "only", "or", "other", "our", "out", "over", "own", "part", "piece", "play", "put", "putting", "rather", "real", "really", "said", "say", "says", "she", "should", "simply", "since", "so", "some", "than", "thanks", "that", "that's", "thats", "the", "their", "them", "then", "there", "these", "they", "they're", "this", "those", "tis", "to", "too", "try", "twas", "us", "use", "used", "uses", "via", "wants", "was", "way", "we", "well", "were", "what", "when", "where", "which", "while", "who", "whom", "why", "will", "with", "would", "yet", "you", "your", "you're", "youre", ] ) for line in fileinput.input(): try: tweet = json.loads(line) except: pass for word in text(tweet).split(" "): word = word.lower() word = word.replace(".", "") word = word.replace(",", "") word = word.replace("...", "") word = word.replace("'", "") word = word.replace(":", "") word = word.replace("(", "") word = word.replace(")", "") if len(word) < 3: continue if len(word) > 15: continue if word in stop_words: continue if word[0] in ["@", "#"]: continue if re.match("https?", word): continue if word.startswith("rt"): continue if not re.match("^[a-z]", word, re.IGNORECASE): continue word_counts[word] = word_counts.get(word, 0) + 1 sorted_words = list(word_counts.keys()) sorted_words.sort(key=lambda x: word_counts[x], reverse=True) top_words = sorted_words[0:MAX_WORDS] words = [] count_range = word_counts[top_words[0]] - word_counts[top_words[-1]] + 1 size_ratio = 100.0 / count_range for word in top_words: size = int(word_counts[word] * size_ratio) + 15 words.append({"text": word, "size": size}) wordcloud_js = urlopen( "https://raw.githubusercontent.com/jasondavies/d3-cloud/master/build/d3.layout.cloud.js" ).read() output = """ twarc wordcloud """ % ( 
def text(t):
    """Return the fullest text available in tweet dict ``t``.

    Uses ``full_text`` when the key is present, otherwise the ``full_text``
    of an ``extended_tweet`` object when there is one, otherwise ``text``.
    Raises KeyError if none of them is present.
    """
    if "full_text" in t:
        return t["full_text"]
    # Tweets may carry the untruncated text in a nested extended_tweet
    # object next to a shortened "text".
    extended = t.get("extended_tweet") or {}
    if "full_text" in extended:
        return extended["full_text"]
    return t["text"]
def main():
    """Download the videos linked from tweets and record url-to-file pairs.

    Every URL found in the tweets' ``entities.urls`` is handed to youtube_dl
    in a separate process (so that it can be stopped after ``--timeout``
    seconds). One ``url<TAB>filename`` line per processed URL is appended to
    ``mapping.tsv`` in the download directory; URLs already listed there are
    skipped. The filename is empty when nothing was downloaded.
    """
    args = parser.parse_args()

    # make download directory
    download_dir = args.download_dir
    if not os.path.isdir(download_dir):
        os.mkdir(download_dir)

    # setup logger
    log_file = "{}/youtubedl.log".format(download_dir)
    logging.basicConfig(filename=log_file, level=logging.INFO)
    log = logging.getLogger()

    # setup youtube_dl config
    ydl_opts = {
        "format": "best",
        "logger": log,
        "restrictfilenames": True,
        "ignoreerrors": True,
        "nooverwrites": True,
        "writedescription": True,
        "writeinfojson": True,
        "writesubtitles": True,
        "writeautomaticsub": True,
        "outtmpl": "{}/%(extractor)s/%(id)s/%(title)s.%(ext)s".format(download_dir),
        "download_archive": "{}/archive.txt".format(download_dir),
    }
    if args.ignore_livestreams:
        ydl_opts["matchfilter"] = match_filter_func("!is_live")
    if args.max_downloads:
        ydl_opts["max_downloads"] = args.max_downloads
    if args.max_filesize:
        ydl_opts["max_filesize"] = args.max_filesize

    # keep track of domains to block
    blocklist = args.block or []

    # read in existing mapping file to know which urls we can ignore
    seen = set()
    mapping_file = os.path.join(download_dir, "mapping.tsv")
    if os.path.isfile(mapping_file):
        with open(mapping_file) as existing:
            for line in existing:
                # Only the first column is needed; splitting once keeps a
                # row with extra tabs from raising.
                url = line.split("\t", 1)[0]
                log.info("found %s in %s", url, mapping_file)
                seen.add(url)

    # loop through the tweets
    with open(mapping_file, "a") as results:
        for line in fileinput.input(args.files):
            tweet = json.loads(line)
            log.info("analyzing %s", tweet["id_str"])
            for e in tweet["entities"]["urls"]:
                url = e.get("unshortened_url") or e["expanded_url"]

                # see if we can skip this one
                if not url:
                    continue
                if url in seen:
                    log.info("already processed %s", url)
                    continue
                seen.add(url)

                # check for blocks
                uri = urlparse(url)
                if uri.netloc in blocklist:
                    log.warning("%s in block list", url)
                    continue

                # set up a multiprocessing queue to manage the download
                # with a timeout
                log.info("processing %s", url)
                q = mp.Queue()
                p = mp.Process(target=download, args=(url, q, ydl_opts, log))
                p.start()

                started = datetime.now()
                while True:
                    # if we've exceeded the timeout terminate the process
                    if args.timeout and datetime.now() - started > timedelta(
                        seconds=args.timeout
                    ):
                        log.warning("reached timeout %s", args.timeout)
                        p.terminate()
                        break
                    # if the process is done we can stop
                    elif not p.is_alive():
                        break
                    # otherwise sleep and the check again
                    time.sleep(1)

                # if the queue was empty there either wasn't a download or
                # it timed out
                if q.empty():
                    filename = ""
                else:
                    filename = q.get()

                p.join()

                # write the result to the mapping file
                results.write("{}\t{}\n".format(url, filename))


def download(url, q, ydl_opts, log):
    """Download ``url`` with youtube_dl and report the file name on ``q``.

    Runs in a child process. When youtube_dl returns info for the URL, the
    prepared file name is put on the queue for the parent to record; nothing
    is put on the queue otherwise.
    """
    try:
        ydl = youtube_dl.YoutubeDL(ydl_opts)
        info = ydl.extract_info(url)
        if info:
            filename = ydl.prepare_filename(info)
            log.info("downloaded %s as %s", url, filename)
            # Without this the parent always sees an empty queue and records
            # an empty file name for every URL.
            q.put(filename)
        else:
            log.warning("%s doesn't look like a video", url)
    except youtube_dl.utils.MaxDownloadsReached:
        # The limit is read from the options passed in; the command line
        # arguments are local to main() and not visible here.
        log.warning(
            "only %s downloads per url allowed", ydl_opts.get("max_downloads")
        )


if __name__ == "__main__":
    main()