Repository: twintproject/twint Branch: master Commit: e7c8a0c764f6 Files: 34 Total size: 136.4 KB Directory structure: gitextract_yl9l9w44/ ├── .github/ │ ├── FUNDING.yml │ ├── ISSUE_TEMPLATE/ │ │ └── ISSUE_TEMPLATE.md │ └── ISSUE_TEMPLATE.md ├── .gitignore ├── .travis.yml ├── Dockerfile ├── LICENSE ├── MANIFEST.in ├── README.md ├── automate.py ├── elasticsearch/ │ └── README.md ├── setup.py ├── test.py └── twint/ ├── __init__.py ├── __version__.py ├── cli.py ├── config.py ├── datelock.py ├── feed.py ├── format.py ├── get.py ├── output.py ├── run.py ├── storage/ │ ├── __init__.py │ ├── db.py │ ├── elasticsearch.py │ ├── panda.py │ ├── write.py │ └── write_meta.py ├── token.py ├── tweet.py ├── url.py ├── user.py └── verbose.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .github/FUNDING.yml ================================================ # These are supported funding model platforms patreon: twintproject custom: paypal.me/noneprivacy ================================================ FILE: .github/ISSUE_TEMPLATE/ISSUE_TEMPLATE.md ================================================ ### Initial Check > If the issue is a request please specify that it is a request in the title (Example: [REQUEST] more features). If this is a question regarding 'twint' please specify that it's a question in the title (Example: [QUESTION] What is x?). Please **only** submit issues related to 'twint'. Thanks. >Make sure you've checked the following: - [] Python version is 3.6; - [] Using the latest version of Twint; - [] Updated Twint with `pip3 install --upgrade -e git+https://github.com/twintproject/twint.git@origin/master#egg=twint`; ### Command Ran >Please provide the _exact_ command ran including the username/search/code so I may reproduce the issue. ### Description of Issue >Please use **as much detail as possible.** ### Environment Details >Using Windows, Linux? What OS version? Running this in Anaconda? Jupyter Notebook? Terminal? ================================================ FILE: .github/ISSUE_TEMPLATE.md ================================================ # Issue Template Please use this template! ### Initial Check > If the issue is a request please specify that it is a request in the title (Example: [REQUEST] more features). If this is a question regarding 'twint' please specify that it's a question in the title (Example: [QUESTION] What is x?). Please **only** submit issues related to 'twint'. Thanks. >Make sure you've checked the following: - [] Python version is 3.6; - [] Updated Twint with `pip3 install --user --upgrade -e git+https://github.com/twintproject/twint.git@origin/master#egg=twint`; - [] I have searched the issues and there are no duplicates of this issue/question/request. ### Command Ran >Please provide the _exact_ command ran including the username/search/code so I may reproduce the issue. ### Description of Issue >Please use **as much detail as possible.** ### Environment Details >Using Windows, Linux? What OS version? Running this in Anaconda? Jupyter Notebook? Terminal? ================================================ FILE: .gitignore ================================================ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class tweets.db # C extensions *.so config.ini twint/storage/mysql.py # Node Dependency directories node_modules/ jspm_packages/ tests/ # Distribution / packaging .Python env/ build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ *.egg-info/ .installed.cfg *.egg # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. *.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover .hypothesis/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder target/ # Jupyter Notebook .ipynb_checkpoints # pyenv .python-version # celery beat schedule file celerybeat-schedule # SageMath parsed files *.sage.py # dotenv .env # virtualenv .venv venv/ ENV/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ # output *.csv *.json *.txt test_twint.py ================================================ FILE: .travis.yml ================================================ dist: bionic language: python python: - "3.6" - "3.7" - "3.8" - "nightly" matrix: allow_failures: - python: "nightly" - python: "3.8" install: - pip install -r requirements.txt script: - python test.py deploy: provider: pypi user: "codyzacharias" password: secure: sWWvx50F7KJBtf8z2njc+Q31WIAHiQs4zKEiGD4/7xrshw55H5z+WnqZ9VIP83qm9yKefoRKp7WnaJeXZ3ulZSLn64ue45lqFozWMyGvelRPOKvZi9XPMqBA7+qllR/GseTHSGC3G5EGxac6UEI3irYe3mZXxfjpxNOXVti8rJ2xX8TiJM0AVKRrdDiAstOhMMkXkB7fYXMQALwEp8UoW/UbjbeqsKueXydjStaESNP/QzRFZ3/tuNu+3HMz/olniLUhUWcF/xDbJVpXuaRMUalgqe+BTbDdtUVt/s/GKtpg5GAzJyhQphiCM/huihedUIKSoI+6A8PTzuxrLhB5BMi9pcllED02v7w1enpu5L2l5cRDgQJSOpkxkA5Eese8nxKOOq0KzwDQa3JByrRor8R4yz+p5s4u2r0Rs2A9fkjQYwd/uWBSEIRF4K9WZoniiikahwXq070DMRgV7HbovKSjo5NK5F8j+psrtqPF+OHN2aVfWxbGnezrOOkmzuTHhWZVj3pPSpQU1WFWHo9fPo4I6YstR4q6XjNNjrpY3ojSlv0ThMbUem7zhHTRkRsSA2SpPfqw5E3Jf7vaiQb4M5zkBVqxuq4tXb14GJ26tGD8tel8u8b+ccpkAE9xf+QavP8UHz4PbBhqgFX5TbV/H++cdsICyoZnT35yiaDOELM= on: tags: true python: "3.7" ================================================ FILE: Dockerfile ================================================ FROM python:3.6-buster LABEL maintainer="codyzacharias@pm.me" WORKDIR /root RUN git clone --depth=1 https://github.com/twintproject/twint.git && \ cd /root/twint && \ pip3 install . -r requirements.txt CMD /bin/bash ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2018 Cody Zacharias Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: MANIFEST.in ================================================ include README.md LICENSE ================================================ FILE: README.md ================================================ # TWINT - Twitter Intelligence Tool ![2](https://i.imgur.com/iaH3s7z.png) ![3](https://i.imgur.com/hVeCrqL.png) [![PyPI](https://img.shields.io/pypi/v/twint.svg)](https://pypi.org/project/twint/) [![Build Status](https://travis-ci.org/twintproject/twint.svg?branch=master)](https://travis-ci.org/twintproject/twint) [![Python 3.6|3.7|3.8](https://img.shields.io/badge/Python-3.6%2F3.7%2F3.8-blue.svg)](https://www.python.org/download/releases/3.0/) [![GitHub license](https://img.shields.io/github/license/haccer/tweep.svg)](https://github.com/haccer/tweep/blob/master/LICENSE) [![Downloads](https://pepy.tech/badge/twint)](https://pepy.tech/project/twint) [![Downloads](https://pepy.tech/badge/twint/week)](https://pepy.tech/project/twint/week) [![Patreon](https://img.shields.io/endpoint.svg?url=https:%2F%2Fshieldsio-patreon.herokuapp.com%2Ftwintproject)](https://www.patreon.com/twintproject) ![](https://img.shields.io/twitter/follow/noneprivacy.svg?label=Follow&style=social) >No authentication. No API. No limits. Twint is an advanced Twitter scraping tool written in Python that allows for scraping Tweets from Twitter profiles **without** using Twitter's API. Twint utilizes Twitter's search operators to let you scrape Tweets from specific users, scrape Tweets relating to certain topics, hashtags & trends, or sort out *sensitive* information from Tweets like e-mail and phone numbers. I find this very useful, and you can get really creative with it too. Twint also makes special queries to Twitter allowing you to also scrape a Twitter user's followers, Tweets a user has liked, and who they follow **without** any authentication, API, Selenium, or browser emulation. ## tl;dr Benefits Some of the benefits of using Twint vs Twitter API: - Can fetch almost __all__ Tweets (Twitter API limits to last 3200 Tweets only); - Fast initial setup; - Can be used anonymously and without Twitter sign up; - **No rate limitations**. ## Limits imposed by Twitter Twitter limits scrolls while browsing the user timeline. This means that with `.Profile` or with `.Favorites` you will be able to get ~3200 tweets. ## Requirements - Python 3.6; - aiohttp; - aiodns; - beautifulsoup4; - cchardet; - dataclasses - elasticsearch; - pysocks; - pandas (>=0.23.0); - aiohttp_socks; - schedule; - geopy; - fake-useragent; - py-googletransx. ## Installing **Git:** ```bash git clone --depth=1 https://github.com/twintproject/twint.git cd twint pip3 install . -r requirements.txt ``` **Pip:** ```bash pip3 install twint ``` or ```bash pip3 install --user --upgrade git+https://github.com/twintproject/twint.git@origin/master#egg=twint ``` **Pipenv**: ```bash pipenv install git+https://github.com/twintproject/twint.git#egg=twint ``` ### March 2, 2021 Update **Added**: Dockerfile Noticed a lot of people are having issues installing (including me). Please use the Dockerfile temporarily while I look into them. ## CLI Basic Examples and Combos A few simple examples to help you understand the basics: - `twint -u username` - Scrape all the Tweets of a *user* (doesn't include **retweets** but includes **replies**). - `twint -u username -s pineapple` - Scrape all Tweets from the *user*'s timeline containing _pineapple_. - `twint -s pineapple` - Collect every Tweet containing *pineapple* from everyone's Tweets. - `twint -u username --year 2014` - Collect Tweets that were tweeted **before** 2014. - `twint -u username --since "2015-12-20 20:30:15"` - Collect Tweets that were tweeted since 2015-12-20 20:30:15. - `twint -u username --since 2015-12-20` - Collect Tweets that were tweeted since 2015-12-20 00:00:00. - `twint -u username -o file.txt` - Scrape Tweets and save to file.txt. - `twint -u username -o file.csv --csv` - Scrape Tweets and save as a csv file. - `twint -u username --email --phone` - Show Tweets that might have phone numbers or email addresses. - `twint -s "Donald Trump" --verified` - Display Tweets by verified users that Tweeted about Donald Trump. - `twint -g="48.880048,2.385939,1km" -o file.csv --csv` - Scrape Tweets from a radius of 1km around a place in Paris and export them to a csv file. - `twint -u username -es localhost:9200` - Output Tweets to Elasticsearch - `twint -u username -o file.json --json` - Scrape Tweets and save as a json file. - `twint -u username --database tweets.db` - Save Tweets to a SQLite database. - `twint -u username --followers` - Scrape a Twitter user's followers. - `twint -u username --following` - Scrape who a Twitter user follows. - `twint -u username --favorites` - Collect all the Tweets a user has favorited (gathers ~3200 tweet). - `twint -u username --following --user-full` - Collect full user information a person follows - `twint -u username --timeline` - Use an effective method to gather Tweets from a user's profile (Gathers ~3200 Tweets, including **retweets** & **replies**). - `twint -u username --retweets` - Use a quick method to gather the last 900 Tweets (that includes retweets) from a user's profile. - `twint -u username --resume resume_file.txt` - Resume a search starting from the last saved scroll-id. More detail about the commands and options are located in the [wiki](https://github.com/twintproject/twint/wiki/Commands) ## Module Example Twint can now be used as a module and supports custom formatting. **More details are located in the [wiki](https://github.com/twintproject/twint/wiki/Module)** ```python import twint # Configure c = twint.Config() c.Username = "realDonaldTrump" c.Search = "great" # Run twint.run.Search(c) ``` > Output `955511208597184512 2018-01-22 18:43:19 GMT pineapples are the best fruit` ```python import twint c = twint.Config() c.Username = "noneprivacy" c.Custom["tweet"] = ["id"] c.Custom["user"] = ["bio"] c.Limit = 10 c.Store_csv = True c.Output = "none" twint.run.Search(c) ``` ## Storing Options - Write to file; - CSV; - JSON; - SQLite; - Elasticsearch. ## Elasticsearch Setup Details on setting up Elasticsearch with Twint is located in the [wiki](https://github.com/twintproject/twint/wiki/Elasticsearch). ## Graph Visualization ![graph](https://i.imgur.com/EEJqB8n.png) [Graph](https://github.com/twintproject/twint/wiki/Graph) details are also located in the [wiki](https://github.com/twintproject/twint/wiki/Graph). We are developing a Twint Desktop App. ![4](https://i.imgur.com/DzcfIgL.png) ## FAQ > I tried scraping tweets from a user, I know that they exist but I'm not getting them Twitter can shadow-ban accounts, which means that their tweets will not be available via search. To solve this, pass `--profile-full` if you are using Twint via CLI or, if are using Twint as module, add `config.Profile_full = True`. Please note that this process will be quite slow. ## More Examples #### Followers/Following > To get only follower usernames/following usernames `twint -u username --followers` `twint -u username --following` > To get user info of followers/following users `twint -u username --followers --user-full` `twint -u username --following --user-full` #### userlist > To get only user info of user `twint -u username --user-full` > To get user info of users from a userlist `twint --userlist inputlist --user-full` #### tweet translation (experimental) > To get 100 english tweets and translate them to italian `twint -u noneprivacy --csv --output none.csv --lang en --translate --translate-dest it --limit 100` or ```python import twint c = twint.Config() c.Username = "noneprivacy" c.Limit = 100 c.Store_csv = True c.Output = "none.csv" c.Lang = "en" c.Translate = True c.TranslateDest = "it" twint.run.Search(c) ``` Notes: - [Google translate has some quotas](https://cloud.google.com/translate/quotas) ## Featured Blog Posts: - [How to use Twint as an OSINT tool](https://pielco11.ovh/posts/twint-osint/) - [Basic tutorial made by Null Byte](https://null-byte.wonderhowto.com/how-to/mine-twitter-for-targeted-information-with-twint-0193853/) - [Analyzing Tweets with NLP in minutes with Spark, Optimus and Twint](https://towardsdatascience.com/analyzing-tweets-with-nlp-in-minutes-with-spark-optimus-and-twint-a0c96084995f) - [Loading tweets into Kafka and Neo4j](https://markhneedham.com/blog/2019/05/29/loading-tweets-twint-kafka-neo4j/) ## Contact If you have any question, want to join in discussions, or need extra help, you are welcome to join our Twint focused channel at [OSINT team](https://osint.team) ================================================ FILE: automate.py ================================================ import twint import schedule import time # you can change the name of each "job" after "def" if you'd like. def jobone(): print ("Fetching Tweets") c = twint.Config() # choose username (optional) c.Username = "insert username here" # choose search term (optional) c.Search = "insert search term here" # choose beginning time (narrow results) c.Since = "2018-01-01" # set limit on total tweets c.Limit = 1000 # no idea, but makes the csv format properly c.Store_csv = True # format of the csv c.Custom = ["date", "time", "username", "tweet", "link", "likes", "retweets", "replies", "mentions", "hashtags"] # change the name of the csv file c.Output = "filename.csv" twint.run.Search(c) def jobtwo(): print ("Fetching Tweets") c = twint.Config() # choose username (optional) c.Username = "insert username here" # choose search term (optional) c.Search = "insert search term here" # choose beginning time (narrow results) c.Since = "2018-01-01" # set limit on total tweets c.Limit = 1000 # no idea, but makes the csv format properly c.Store_csv = True # format of the csv c.Custom = ["date", "time", "username", "tweet", "link", "likes", "retweets", "replies", "mentions", "hashtags"] # change the name of the csv file c.Output = "filename2.csv" twint.run.Search(c) # run once when you start the program jobone() jobtwo() # run every minute(s), hour, day at, day of the week, day of the week and time. Use "#" to block out which ones you don't want to use. Remove it to active. Also, replace "jobone" and "jobtwo" with your new function names (if applicable) # schedule.every(1).minutes.do(jobone) schedule.every().hour.do(jobone) # schedule.every().day.at("10:30").do(jobone) # schedule.every().monday.do(jobone) # schedule.every().wednesday.at("13:15").do(jobone) # schedule.every(1).minutes.do(jobtwo) schedule.every().hour.do(jobtwo) # schedule.every().day.at("10:30").do(jobtwo) # schedule.every().monday.do(jobtwo) # schedule.every().wednesday.at("13:15").do(jobtwo) while True: schedule.run_pending() time.sleep(1) ================================================ FILE: elasticsearch/README.md ================================================ # Elasticsearch How-To ![dashboard](https://i.imgur.com/BEbtdo5.png) Please read the Wiki [here](https://github.com/twintproject/twint/wiki/Elasticsearch) ================================================ FILE: setup.py ================================================ #!/usr/bin/python3 from setuptools import setup import io import os # Package meta-data NAME = 'twint' DESCRIPTION = 'An advanced Twitter scraping & OSINT tool.' URL = 'https://github.com/twintproject/twint' EMAIL = 'codyzacharias@pm.me' AUTHOR = 'Cody Zacharias' REQUIRES_PYTHON = '>=3.6.0' VERSION = None # Packages required REQUIRED = [ 'aiohttp', 'aiodns', 'beautifulsoup4', 'cchardet', 'dataclasses', 'elasticsearch', 'pysocks', 'pandas', 'aiohttp_socks', 'schedule', 'geopy', 'fake-useragent', 'googletransx' ] here = os.path.abspath(os.path.dirname(__file__)) with io.open(os.path.join(here, 'README.md'), encoding='utf-8') as f: long_description = '\n' + f.read() # Load the package's __version__.py about = {} if not VERSION: with open(os.path.join(here, NAME, '__version__.py')) as f: exec(f.read(), about) else: about['__version__'] = VERSION setup( name=NAME, version=about['__version__'], description=DESCRIPTION, long_description=long_description, long_description_content_type="text/markdown", author=AUTHOR, author_email=EMAIL, python_requires=REQUIRES_PYTHON, url=URL, packages=['twint', 'twint.storage'], entry_points={ 'console_scripts': [ 'twint = twint.cli:run_as_command', ], }, install_requires=REQUIRED, dependency_links=[ 'git+https://github.com/x0rzkov/py-googletrans#egg=googletrans' ], license='MIT', classifiers=[ 'License :: OSI Approved :: MIT License', 'Programming Language :: Python', 'Programming Language :: Python :: 3', 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: 3.8', 'Programming Language :: Python :: Implementation :: CPython', ], ) ================================================ FILE: test.py ================================================ import twint import os ''' Test.py - Testing TWINT to make sure everything works. ''' def test_reg(c, run): print("[+] Beginning vanilla test in {}".format(str(run))) run(c) def test_db(c, run): print("[+] Beginning DB test in {}".format(str(run))) c.Database = "test_twint.db" run(c) def custom(c, run, _type): print("[+] Beginning custom {} test in {}".format(_type, str(run))) c.Custom['tweet'] = ["id", "username"] c.Custom['user'] = ["id", "username"] run(c) def test_json(c, run): c.Store_json = True c.Output = "test_twint.json" custom(c, run, "JSON") print("[+] Beginning JSON test in {}".format(str(run))) run(c) def test_csv(c, run): c.Store_csv = True c.Output = "test_twint.csv" custom(c, run, "CSV") print("[+] Beginning CSV test in {}".format(str(run))) run(c) def main(): c = twint.Config() c.Username = "verified" c.Limit = 20 c.Store_object = True # Separate objects are necessary. f = twint.Config() f.Username = "verified" f.Limit = 20 f.Store_object = True f.User_full = True runs = [ twint.run.Profile, # this doesn't twint.run.Search, # this works twint.run.Following, twint.run.Followers, twint.run.Favorites, ] tests = [test_reg, test_json, test_csv, test_db] # Something breaks if we don't split these up for run in runs[:3]: if run == twint.run.Search: c.Since = "2012-1-1 20:30:22" c.Until = "2017-1-1" else: c.Since = "" c.Until = "" for test in tests: test(c, run) for run in runs[3:]: for test in tests: test(f, run) files = ["test_twint.db", "test_twint.json", "test_twint.csv"] for _file in files: os.remove(_file) print("[+] Testing complete!") if __name__ == '__main__': main() ================================================ FILE: twint/__init__.py ================================================ ''' TWINT - Twitter Intelligence Tool (formerly known as Tweep). See wiki on Github for in-depth details. https://github.com/twintproject/twint/wiki Licensed under MIT License Copyright (c) 2018 Cody Zacharias ''' import logging, os from .config import Config from .__version__ import __version__ from . import run _levels = { 'info': logging.INFO, 'debug': logging.DEBUG } _level = os.getenv('TWINT_DEBUG', 'info') _logLevel = _levels[_level] if _level == "debug": logger = logging.getLogger() _output_fn = 'twint.log' logger.setLevel(_logLevel) formatter = logging.Formatter('%(levelname)s:%(asctime)s:%(name)s:%(message)s') fileHandler = logging.FileHandler(_output_fn) fileHandler.setLevel(_logLevel) fileHandler.setFormatter(formatter) logger.addHandler(fileHandler) ================================================ FILE: twint/__version__.py ================================================ VERSION = (2, 1, 21) __version__ = '.'.join(map(str, VERSION)) ================================================ FILE: twint/cli.py ================================================ #!/usr/bin/env python3 ''' Twint.py - Twitter Intelligence Tool (formerly known as Tweep). See wiki on Github for in-depth details. https://github.com/twintproject/twint/wiki Licensed under MIT License Copyright (c) 2018 The Twint Project ''' import sys import os import argparse from . import run from . import config from . import storage def error(_error, message): """ Print errors to stdout """ print("[-] {}: {}".format(_error, message)) sys.exit(0) def check(args): """ Error checking """ if args.username is not None or args.userlist or args.members_list: if args.verified: error("Contradicting Args", "Please use --verified in combination with -s.") if args.userid: error("Contradicting Args", "--userid and -u cannot be used together.") if args.all: error("Contradicting Args", "--all and -u cannot be used together.") elif args.search and args.timeline: error("Contradicting Args", "--s and --tl cannot be used together.") elif args.timeline and not args.username: error("Error", "-tl cannot be used without -u.") elif args.search is None: if args.custom_query is not None: pass elif (args.geo or args.near) is None and not (args.all or args.userid): error("Error", "Please use at least -u, -s, -g or --near.") elif args.all and args.userid: error("Contradicting Args", "--all and --userid cannot be used together") if args.output is None: if args.csv: error("Error", "Please specify an output file (Example: -o file.csv).") elif args.json: error("Error", "Please specify an output file (Example: -o file.json).") if args.backoff_exponent <= 0: error("Error", "Please specifiy a positive value for backoff_exponent") if args.min_wait_time < 0: error("Error", "Please specifiy a non negative value for min_wait_time") def loadUserList(ul, _type): """ Concatenate users """ if os.path.exists(os.path.abspath(ul)): userlist = open(os.path.abspath(ul), "r").read().splitlines() else: userlist = ul.split(",") if _type == "search": un = "" for user in userlist: un += "%20OR%20from%3A" + user return un[15:] return userlist def initialize(args): """ Set default values for config from args """ c = config.Config() c.Username = args.username c.User_id = args.userid c.Search = args.search c.Geo = args.geo c.Location = args.location c.Near = args.near c.Lang = args.lang c.Output = args.output c.Elasticsearch = args.elasticsearch c.Year = args.year c.Since = args.since c.Until = args.until c.Email = args.email c.Phone = args.phone c.Verified = args.verified c.Store_csv = args.csv c.Tabs = args.tabs c.Store_json = args.json c.Show_hashtags = args.hashtags c.Show_cashtags = args.cashtags c.Limit = args.limit c.Count = args.count c.Stats = args.stats c.Database = args.database c.To = args.to c.All = args.all c.Essid = args.essid c.Format = args.format c.User_full = args.user_full # c.Profile_full = args.profile_full c.Pandas_type = args.pandas_type c.Index_tweets = args.index_tweets c.Index_follow = args.index_follow c.Index_users = args.index_users c.Debug = args.debug c.Resume = args.resume c.Images = args.images c.Videos = args.videos c.Media = args.media c.Replies = args.replies c.Pandas_clean = args.pandas_clean c.Proxy_host = args.proxy_host c.Proxy_port = args.proxy_port c.Proxy_type = args.proxy_type c.Tor_control_port = args.tor_control_port c.Tor_control_password = args.tor_control_password c.Retweets = args.retweets c.Custom_query = args.custom_query c.Popular_tweets = args.popular_tweets c.Skip_certs = args.skip_certs c.Hide_output = args.hide_output c.Native_retweets = args.native_retweets c.Min_likes = args.min_likes c.Min_retweets = args.min_retweets c.Min_replies = args.min_replies c.Links = args.links c.Source = args.source c.Members_list = args.members_list c.Filter_retweets = args.filter_retweets c.Translate = args.translate c.TranslateDest = args.translate_dest c.Backoff_exponent = args.backoff_exponent c.Min_wait_time = args.min_wait_time return c def options(): """ Parse arguments """ ap = argparse.ArgumentParser(prog="twint", usage="python3 %(prog)s [options]", description="TWINT - An Advanced Twitter Scraping Tool.") ap.add_argument("-u", "--username", help="User's Tweets you want to scrape.") ap.add_argument("-s", "--search", help="Search for Tweets containing this word or phrase.") ap.add_argument("-g", "--geo", help="Search for geocoded Tweets.") ap.add_argument("--near", help="Near a specified city.") ap.add_argument("--location", help="Show user's location (Experimental).", action="store_true") ap.add_argument("-l", "--lang", help="Search for Tweets in a specific language.") ap.add_argument("-o", "--output", help="Save output to a file.") ap.add_argument("-es", "--elasticsearch", help="Index to Elasticsearch.") ap.add_argument("--year", help="Filter Tweets before specified year.") ap.add_argument("--since", help="Filter Tweets sent since date (Example: \"2017-12-27 20:30:15\" or 2017-12-27).", metavar="DATE") ap.add_argument("--until", help="Filter Tweets sent until date (Example: \"2017-12-27 20:30:15\" or 2017-12-27).", metavar="DATE") ap.add_argument("--email", help="Filter Tweets that might have email addresses", action="store_true") ap.add_argument("--phone", help="Filter Tweets that might have phone numbers", action="store_true") ap.add_argument("--verified", help="Display Tweets only from verified users (Use with -s).", action="store_true") ap.add_argument("--csv", help="Write as .csv file.", action="store_true") ap.add_argument("--tabs", help="Separate CSV fields with tab characters, not commas.", action="store_true") ap.add_argument("--json", help="Write as .json file", action="store_true") ap.add_argument("--hashtags", help="Output hashtags in seperate column.", action="store_true") ap.add_argument("--cashtags", help="Output cashtags in seperate column.", action="store_true") ap.add_argument("--userid", help="Twitter user id.") ap.add_argument("--limit", help="Number of Tweets to pull (Increments of 20).") ap.add_argument("--count", help="Display number of Tweets scraped at the end of session.", action="store_true") ap.add_argument("--stats", help="Show number of replies, retweets, and likes.", action="store_true") ap.add_argument("-db", "--database", help="Store Tweets in a sqlite3 database.") ap.add_argument("--to", help="Search Tweets to a user.", metavar="USERNAME") ap.add_argument("--all", help="Search all Tweets associated with a user.", metavar="USERNAME") ap.add_argument("--followers", help="Scrape a person's followers.", action="store_true") ap.add_argument("--following", help="Scrape a person's follows", action="store_true") ap.add_argument("--favorites", help="Scrape Tweets a user has liked.", action="store_true") ap.add_argument("--proxy-type", help="Socks5, HTTP, etc.") ap.add_argument("--proxy-host", help="Proxy hostname or IP.") ap.add_argument("--proxy-port", help="The port of the proxy server.") ap.add_argument("--tor-control-port", help="If proxy-host is set to tor, this is the control port", default=9051) ap.add_argument("--tor-control-password", help="If proxy-host is set to tor, this is the password for the control port", default="my_password") ap.add_argument("--essid", help="Elasticsearch Session ID, use this to differentiate scraping sessions.", nargs="?", default="") ap.add_argument("--userlist", help="Userlist from list or file.") ap.add_argument("--retweets", help="Include user's Retweets (Warning: limited).", action="store_true") ap.add_argument("--format", help="Custom output format (See wiki for details).") ap.add_argument("--user-full", help="Collect all user information (Use with followers or following only).", action="store_true") # I am removing this this feature for the time being, because it is no longer required, default method will do this # ap.add_argument("--profile-full", # help="Slow, but effective method of collecting a user's Tweets and RT.", # action="store_true") ap.add_argument( "-tl", "--timeline", help="Collects every tweet from a User's Timeline. (Tweets, RTs & Replies)", action="store_true", ) ap.add_argument("--translate", help="Get tweets translated by Google Translate.", action="store_true") ap.add_argument("--translate-dest", help="Translate tweet to language (ISO2).", default="en") ap.add_argument("--store-pandas", help="Save Tweets in a DataFrame (Pandas) file.") ap.add_argument("--pandas-type", help="Specify HDF5 or Pickle (HDF5 as default)", nargs="?", default="HDF5") ap.add_argument("-it", "--index-tweets", help="Custom Elasticsearch Index name for Tweets.", nargs="?", default="twinttweets") ap.add_argument("-if", "--index-follow", help="Custom Elasticsearch Index name for Follows.", nargs="?", default="twintgraph") ap.add_argument("-iu", "--index-users", help="Custom Elasticsearch Index name for Users.", nargs="?", default="twintuser") ap.add_argument("--debug", help="Store information in debug logs", action="store_true") ap.add_argument("--resume", help="Resume from Tweet ID.", metavar="TWEET_ID") ap.add_argument("--videos", help="Display only Tweets with videos.", action="store_true") ap.add_argument("--images", help="Display only Tweets with images.", action="store_true") ap.add_argument("--media", help="Display Tweets with only images or videos.", action="store_true") ap.add_argument("--replies", help="Display replies to a subject.", action="store_true") ap.add_argument("-pc", "--pandas-clean", help="Automatically clean Pandas dataframe at every scrape.") ap.add_argument("-cq", "--custom-query", help="Custom search query.") ap.add_argument("-pt", "--popular-tweets", help="Scrape popular tweets instead of recent ones.", action="store_true") ap.add_argument("-sc", "--skip-certs", help="Skip certs verification, useful for SSC.", action="store_false") ap.add_argument("-ho", "--hide-output", help="Hide output, no tweets will be displayed.", action="store_true") ap.add_argument("-nr", "--native-retweets", help="Filter the results for retweets only.", action="store_true") ap.add_argument("--min-likes", help="Filter the tweets by minimum number of likes.") ap.add_argument("--min-retweets", help="Filter the tweets by minimum number of retweets.") ap.add_argument("--min-replies", help="Filter the tweets by minimum number of replies.") ap.add_argument("--links", help="Include or exclude tweets containing one o more links. If not specified" + " you will get both tweets that might contain links or not.") ap.add_argument("--source", help="Filter the tweets for specific source client.") ap.add_argument("--members-list", help="Filter the tweets sent by users in a given list.") ap.add_argument("-fr", "--filter-retweets", help="Exclude retweets from the results.", action="store_true") ap.add_argument("--backoff-exponent", help="Specify a exponent for the polynomial backoff in case of errors.", type=float, default=3.0) ap.add_argument("--min-wait-time", type=float, default=15, help="specifiy a minimum wait time in case of scraping limit error. This value will be adjusted by twint if the value provided does not satisfy the limits constraints") args = ap.parse_args() return args def main(): """ Main """ args = options() check(args) if args.pandas_clean: storage.panda.clean() c = initialize(args) if args.userlist: c.Query = loadUserList(args.userlist, "search") if args.pandas_clean: storage.panda.clean() if args.favorites: if args.userlist: _userlist = loadUserList(args.userlist, "favorites") for _user in _userlist: args.username = _user c = initialize(args) run.Favorites(c) else: run.Favorites(c) elif args.following: if args.userlist: _userlist = loadUserList(args.userlist, "following") for _user in _userlist: args.username = _user c = initialize(args) run.Following(c) else: run.Following(c) elif args.followers: if args.userlist: _userlist = loadUserList(args.userlist, "followers") for _user in _userlist: args.username = _user c = initialize(args) run.Followers(c) else: run.Followers(c) elif args.retweets: # or args.profile_full: if args.userlist: _userlist = loadUserList(args.userlist, "profile") for _user in _userlist: args.username = _user c = initialize(args) run.Profile(c) else: run.Profile(c) elif args.user_full: if args.userlist: _userlist = loadUserList(args.userlist, "userlist") for _user in _userlist: args.username = _user c = initialize(args) run.Lookup(c) else: run.Lookup(c) elif args.timeline: run.Profile(c) else: run.Search(c) def run_as_command(): version = ".".join(str(v) for v in sys.version_info[:2]) if float(version) < 3.6: print("[-] TWINT requires Python version 3.6+.") sys.exit(0) main() if __name__ == '__main__': main() ================================================ FILE: twint/config.py ================================================ from dataclasses import dataclass from typing import Optional @dataclass class Config: Username: Optional[str] = None User_id: Optional[str] = None Search: Optional[str] = None Lookup: bool = False Geo: str = "" Location: bool = False Near: str = None Lang: Optional[str] = None Output: Optional[str] = None Elasticsearch: object = None Year: Optional[int] = None Since: Optional[str] = None Until: Optional[str] = None Email: Optional[str] = None Phone: Optional[str] = None Verified: bool = False Store_csv: bool = False Store_json: bool = False Custom = {"tweet": None, "user": None, "username": None} Show_hashtags: bool = False Show_cashtags: bool = False Limit: Optional[int] = None Count: Optional[int] = None Stats: bool = False Database: object = None To: str = None All = None Debug: bool = False Format = None Essid: str = "" Profile: bool = False Followers: bool = False Following: bool = False Favorites: bool = False TwitterSearch: bool = False User_full: bool = False # Profile_full: bool = False Store_object: bool = False Store_object_tweets_list: list = None Store_object_users_list: list = None Store_object_follow_list: list = None Pandas_type: type = None Pandas: bool = False Index_tweets: str = "twinttweets" Index_follow: str = "twintgraph" Index_users: str = "twintuser" Retries_count: int = 10 Resume: object = None Images: bool = False Videos: bool = False Media: bool = False Replies: bool = False Pandas_clean: bool = True Lowercase: bool = True Pandas_au: bool = True Proxy_host: str = "" Proxy_port: int = 0 Proxy_type: object = None Tor_control_port: int = 9051 Tor_control_password: str = None Retweets: bool = False Query: str = None Hide_output: bool = False Custom_query: str = "" Popular_tweets: bool = False Skip_certs: bool = False Native_retweets: bool = False Min_likes: int = 0 Min_retweets: int = 0 Min_replies: int = 0 Links: Optional[str] = None Source: Optional[str] = None Members_list: Optional[str] = None Filter_retweets: bool = False Translate: bool = False TranslateSrc: str = "en" TranslateDest: str = "en" Backoff_exponent: float = 3.0 Min_wait_time: int = 0 Bearer_token: str = None Guest_token: str = None deleted: list = None ================================================ FILE: twint/datelock.py ================================================ import datetime import logging as logme from .tweet import utc_to_local class Datelock: until = None since = None _since_def_user = None def convertToDateTime(string): dateTimeList = string.split() ListLength = len(dateTimeList) if ListLength == 2: return string if ListLength == 1: return string + " 00:00:00" else: return "" def Set(Until, Since): logme.debug(__name__+':Set') d = Datelock() if Until: d.until = datetime.datetime.strptime(convertToDateTime(Until), "%Y-%m-%d %H:%M:%S") d.until = utc_to_local(d.until) else: d.until = datetime.datetime.today() if Since: d.since = datetime.datetime.strptime(convertToDateTime(Since), "%Y-%m-%d %H:%M:%S") d.since = utc_to_local(d.since) d._since_def_user = True else: d.since = datetime.datetime.strptime("2006-03-21 00:00:00", "%Y-%m-%d %H:%M:%S") d.since = utc_to_local(d.since) d._since_def_user = False return d ================================================ FILE: twint/feed.py ================================================ import time from datetime import datetime from bs4 import BeautifulSoup from re import findall from json import loads import logging as logme from .tweet import utc_to_local, Tweet_formats class NoMoreTweetsException(Exception): def __init__(self, msg): super().__init__(msg) def Follow(response): logme.debug(__name__ + ':Follow') soup = BeautifulSoup(response, "html.parser") follow = soup.find_all("td", "info fifty screenname") cursor = soup.find_all("div", "w-button-more") try: cursor = findall(r'cursor=(.*?)">', str(cursor))[0] except IndexError: logme.critical(__name__ + ':Follow:IndexError') return follow, cursor # TODO: this won't be used by --profile-full anymore. if it isn't used anywhere else, perhaps remove this in future def Mobile(response): logme.debug(__name__ + ':Mobile') soup = BeautifulSoup(response, "html.parser") tweets = soup.find_all("span", "metadata") max_id = soup.find_all("div", "w-button-more") try: max_id = findall(r'max_id=(.*?)">', str(max_id))[0] except Exception as e: logme.critical(__name__ + ':Mobile:' + str(e)) return tweets, max_id def MobileFav(response): soup = BeautifulSoup(response, "html.parser") tweets = soup.find_all("table", "tweet") max_id = soup.find_all("div", "w-button-more") try: max_id = findall(r'max_id=(.*?)">', str(max_id))[0] except Exception as e: print(str(e) + " [x] feed.MobileFav") return tweets, max_id def _get_cursor(response): try: next_cursor = response['timeline']['instructions'][0]['addEntries']['entries'][-1]['content'][ 'operation']['cursor']['value'] except KeyError: # this is needed because after the first request location of cursor is changed next_cursor = response['timeline']['instructions'][-1]['replaceEntry']['entry']['content']['operation'][ 'cursor']['value'] return next_cursor def Json(response): logme.debug(__name__ + ':Json') json_response = loads(response) html = json_response["items_html"] soup = BeautifulSoup(html, "html.parser") feed = soup.find_all("div", "tweet") return feed, json_response["min_position"] def parse_tweets(config, response): logme.debug(__name__ + ':parse_tweets') response = loads(response) if len(response['globalObjects']['tweets']) == 0: msg = 'No more data!' raise NoMoreTweetsException(msg) feed = [] for timeline_entry in response['timeline']['instructions'][0]['addEntries']['entries']: # this will handle the cases when the timeline entry is a tweet if (config.TwitterSearch or config.Profile) and (timeline_entry['entryId'].startswith('sq-I-t-') or timeline_entry['entryId'].startswith('tweet-')): if 'tweet' in timeline_entry['content']['item']['content']: _id = timeline_entry['content']['item']['content']['tweet']['id'] # skip the ads if 'promotedMetadata' in timeline_entry['content']['item']['content']['tweet']: continue elif 'tombstone' in timeline_entry['content']['item']['content'] and 'tweet' in \ timeline_entry['content']['item']['content']['tombstone']: _id = timeline_entry['content']['item']['content']['tombstone']['tweet']['id'] else: _id = None if _id is None: raise ValueError('Unable to find ID of tweet in timeline.') try: temp_obj = response['globalObjects']['tweets'][_id] except KeyError: logme.info('encountered a deleted tweet with id {}'.format(_id)) config.deleted.append(_id) continue temp_obj['user_data'] = response['globalObjects']['users'][temp_obj['user_id_str']] if 'retweeted_status_id_str' in temp_obj: rt_id = temp_obj['retweeted_status_id_str'] _dt = response['globalObjects']['tweets'][rt_id]['created_at'] _dt = datetime.strptime(_dt, '%a %b %d %H:%M:%S %z %Y') _dt = utc_to_local(_dt) _dt = str(_dt.strftime(Tweet_formats['datetime'])) temp_obj['retweet_data'] = { 'user_rt_id': response['globalObjects']['tweets'][rt_id]['user_id_str'], 'user_rt': response['globalObjects']['tweets'][rt_id]['full_text'], 'retweet_id': rt_id, 'retweet_date': _dt, } feed.append(temp_obj) next_cursor = _get_cursor(response) return feed, next_cursor ================================================ FILE: twint/format.py ================================================ import logging as logme def Tweet(config, t): if config.Format: logme.debug(__name__+':Tweet:Format') output = config.Format.replace("{id}", t.id_str) output = output.replace("{conversation_id}", t.conversation_id) output = output.replace("{date}", t.datestamp) output = output.replace("{time}", t.timestamp) output = output.replace("{user_id}", t.user_id_str) output = output.replace("{username}", t.username) output = output.replace("{name}", t.name) output = output.replace("{place}", t.place) output = output.replace("{timezone}", t.timezone) output = output.replace("{urls}", ",".join(t.urls)) output = output.replace("{photos}", ",".join(t.photos)) output = output.replace("{video}", str(t.video)) output = output.replace("{thumbnail}", t.thumbnail) output = output.replace("{tweet}", t.tweet) output = output.replace("{language}", t.lang) output = output.replace("{hashtags}", ",".join(t.hashtags)) output = output.replace("{cashtags}", ",".join(t.cashtags)) output = output.replace("{replies}", t.replies_count) output = output.replace("{retweets}", t.retweets_count) output = output.replace("{likes}", t.likes_count) output = output.replace("{link}", t.link) output = output.replace("{is_retweet}", str(t.retweet)) output = output.replace("{user_rt_id}", str(t.user_rt_id)) output = output.replace("{quote_url}", t.quote_url) output = output.replace("{near}", t.near) output = output.replace("{geo}", t.geo) output = output.replace("{mentions}", ",".join(t.mentions)) output = output.replace("{translate}", t.translate) output = output.replace("{trans_src}", t.trans_src) output = output.replace("{trans_dest}", t.trans_dest) else: logme.debug(__name__+':Tweet:notFormat') output = f"{t.id_str} {t.datestamp} {t.timestamp} {t.timezone} " # TODO: someone who is familiar with this code, needs to take a look at what this is # if t.retweet: # output += "RT " output += f"<{t.username}> {t.tweet}" if config.Show_hashtags: hashtags = ",".join(t.hashtags) output += f" {hashtags}" if config.Show_cashtags: cashtags = ",".join(t.cashtags) output += f" {cashtags}" if config.Stats: output += f" | {t.replies_count} replies {t.retweets_count} retweets {t.likes_count} likes" if config.Translate: output += f" {t.translate} {t.trans_src} {t.trans_dest}" return output def User(_format, u): if _format: logme.debug(__name__+':User:Format') output = _format.replace("{id}", str(u.id)) output = output.replace("{name}", u.name) output = output.replace("{username}", u.username) output = output.replace("{bio}", u.bio) output = output.replace("{location}", u.location) output = output.replace("{url}", u.url) output = output.replace("{join_date}", u.join_date) output = output.replace("{join_time}", u.join_time) output = output.replace("{tweets}", str(u.tweets)) output = output.replace("{following}", str(u.following)) output = output.replace("{followers}", str(u.followers)) output = output.replace("{likes}", str(u.likes)) output = output.replace("{media}", str(u.media_count)) output = output.replace("{private}", str(u.is_private)) output = output.replace("{verified}", str(u.is_verified)) output = output.replace("{avatar}", u.avatar) if u.background_image: output = output.replace("{background_image}", u.background_image) else: output = output.replace("{background_image}", "") else: logme.debug(__name__+':User:notFormat') output = f"{u.id} | {u.name} | @{u.username} | Private: " output += f"{u.is_private} | Verified: {u.is_verified} |" output += f" Bio: {u.bio} | Location: {u.location} | Url: " output += f"{u.url} | Joined: {u.join_date} {u.join_time} " output += f"| Tweets: {u.tweets} | Following: {u.following}" output += f" | Followers: {u.followers} | Likes: {u.likes} " output += f"| Media: {u.media_count} | Avatar: {u.avatar}" return output ================================================ FILE: twint/get.py ================================================ from async_timeout import timeout from datetime import datetime from bs4 import BeautifulSoup import sys import socket import aiohttp from fake_useragent import UserAgent import asyncio import concurrent.futures import random from json import loads, dumps from aiohttp_socks import ProxyConnector, ProxyType from urllib.parse import quote from . import url from .output import Tweets, Users from .token import TokenExpiryException import logging as logme httpproxy = None user_agent_list = [ # 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)' # ' Chrome/60.0.3112.113 Safari/537.36', # 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)' # ' Chrome/60.0.3112.90 Safari/537.36', # 'Mozilla/5.0 (Windows NT 5.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)' # ' Chrome/60.0.3112.90 Safari/537.36', # 'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)' # ' Chrome/60.0.3112.90 Safari/537.36', # 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)' # ' Chrome/44.0.2403.157 Safari/537.36', # 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)' # ' Chrome/60.0.3112.113 Safari/537.36', # 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)' # ' Chrome/57.0.2987.133 Safari/537.36', # 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)' # ' Chrome/57.0.2987.133 Safari/537.36', # 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)' # ' Chrome/55.0.2883.87 Safari/537.36', # 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)' # ' Chrome/55.0.2883.87 Safari/537.36', 'Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1)', 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko', 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)', 'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko', 'Mozilla/5.0 (Windows NT 6.2; WOW64; Trident/7.0; rv:11.0) like Gecko', 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko', 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0)', 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko', 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)', 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; Trident/7.0; rv:11.0) like Gecko', 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)', 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)', 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET ' 'CLR 3.5.30729)', ] # function to convert python `dict` to json and then encode it to be passed in the url as a parameter # some urls require this format def dict_to_url(dct): return quote(dumps(dct)) def get_connector(config): logme.debug(__name__ + ':get_connector') _connector = None if config.Proxy_host: if config.Proxy_host.lower() == "tor": _connector = ProxyConnector( host='127.0.0.1', port=9050, rdns=True) elif config.Proxy_port and config.Proxy_type: if config.Proxy_type.lower() == "socks5": _type = ProxyType.SOCKS5 elif config.Proxy_type.lower() == "socks4": _type = ProxyType.SOCKS4 elif config.Proxy_type.lower() == "http": global httpproxy httpproxy = "http://" + config.Proxy_host + ":" + str(config.Proxy_port) return _connector else: logme.critical("get_connector:proxy-type-error") print("Error: Proxy types allowed are: http, socks5 and socks4. No https.") sys.exit(1) _connector = ProxyConnector( proxy_type=_type, host=config.Proxy_host, port=config.Proxy_port, rdns=True) else: logme.critical(__name__ + ':get_connector:proxy-port-type-error') print("Error: Please specify --proxy-host, --proxy-port, and --proxy-type") sys.exit(1) else: if config.Proxy_port or config.Proxy_type: logme.critical(__name__ + ':get_connector:proxy-host-arg-error') print("Error: Please specify --proxy-host, --proxy-port, and --proxy-type") sys.exit(1) return _connector async def RequestUrl(config, init): logme.debug(__name__ + ':RequestUrl') _connector = get_connector(config) _serialQuery = "" params = [] _url = "" _headers = [("authorization", config.Bearer_token), ("x-guest-token", config.Guest_token)] # TODO : do this later if config.Profile: logme.debug(__name__ + ':RequestUrl:Profile') _url, params, _serialQuery = url.SearchProfile(config, init) elif config.TwitterSearch: logme.debug(__name__ + ':RequestUrl:TwitterSearch') _url, params, _serialQuery = await url.Search(config, init) else: if config.Following: logme.debug(__name__ + ':RequestUrl:Following') _url = await url.Following(config.Username, init) elif config.Followers: logme.debug(__name__ + ':RequestUrl:Followers') _url = await url.Followers(config.Username, init) else: logme.debug(__name__ + ':RequestUrl:Favorites') _url = await url.Favorites(config.Username, init) _serialQuery = _url response = await Request(_url, params=params, connector=_connector, headers=_headers) if config.Debug: print(_serialQuery, file=open("twint-request_urls.log", "a", encoding="utf-8")) return response def ForceNewTorIdentity(config): logme.debug(__name__ + ':ForceNewTorIdentity') try: tor_c = socket.create_connection(('127.0.0.1', config.Tor_control_port)) tor_c.send('AUTHENTICATE "{}"\r\nSIGNAL NEWNYM\r\n'.format(config.Tor_control_password).encode()) response = tor_c.recv(1024) if response != b'250 OK\r\n250 OK\r\n': sys.stderr.write('Unexpected response from Tor control port: {}\n'.format(response)) logme.critical(__name__ + ':ForceNewTorIdentity:unexpectedResponse') except Exception as e: logme.debug(__name__ + ':ForceNewTorIdentity:errorConnectingTor') sys.stderr.write('Error connecting to Tor control port: {}\n'.format(repr(e))) sys.stderr.write('If you want to rotate Tor ports automatically - enable Tor control port\n') async def Request(_url, connector=None, params=None, headers=None): logme.debug(__name__ + ':Request:Connector') async with aiohttp.ClientSession(connector=connector, headers=headers) as session: return await Response(session, _url, params) async def Response(session, _url, params=None): logme.debug(__name__ + ':Response') with timeout(120): async with session.get(_url, ssl=True, params=params, proxy=httpproxy) as response: resp = await response.text() if response.status == 429: # 429 implies Too many requests i.e. Rate Limit Exceeded raise TokenExpiryException(loads(resp)['errors'][0]['message']) return resp async def RandomUserAgent(wa=None): logme.debug(__name__ + ':RandomUserAgent') try: if wa: return "Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36" return UserAgent(verify_ssl=False, use_cache_server=False).random except: return random.choice(user_agent_list) async def Username(_id, bearer_token, guest_token): logme.debug(__name__ + ':Username') _dct = {'userId': _id, 'withHighlightedLabel': False} _url = "https://api.twitter.com/graphql/B9FuNQVmyx32rdbIPEZKag/UserByRestId?variables={}".format(dict_to_url(_dct)) _headers = { 'authorization': bearer_token, 'x-guest-token': guest_token, } r = await Request(_url, headers=_headers) j_r = loads(r) username = j_r['data']['user']['legacy']['screen_name'] return username async def Tweet(url, config, conn): logme.debug(__name__ + ':Tweet') try: response = await Request(url) soup = BeautifulSoup(response, "html.parser") tweets = soup.find_all("div", "tweet") await Tweets(tweets, config, conn, url) except Exception as e: logme.critical(__name__ + ':Tweet:' + str(e)) async def User(username, config, conn, user_id=False): logme.debug(__name__ + ':User') _dct = {'screen_name': username, 'withHighlightedLabel': False} _url = 'https://api.twitter.com/graphql/jMaTS-_Ea8vh9rpKggJbCQ/UserByScreenName?variables={}'\ .format(dict_to_url(_dct)) _headers = { 'authorization': config.Bearer_token, 'x-guest-token': config.Guest_token, } try: response = await Request(_url, headers=_headers) j_r = loads(response) if user_id: try: _id = j_r['data']['user']['rest_id'] return _id except KeyError as e: logme.critical(__name__ + ':User:' + str(e)) return await Users(j_r, config, conn) except Exception as e: logme.critical(__name__ + ':User:' + str(e)) raise def Limit(Limit, count): logme.debug(__name__ + ':Limit') if Limit is not None and count >= int(Limit): return True async def Multi(feed, config, conn): logme.debug(__name__ + ':Multi') count = 0 try: with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor: loop = asyncio.get_event_loop() futures = [] for tweet in feed: count += 1 if config.Favorites or config.Profile_full: logme.debug(__name__ + ':Multi:Favorites-profileFull') link = tweet.find("a")["href"] url = f"https://twitter.com{link}&lang=en" elif config.User_full: logme.debug(__name__ + ':Multi:userFull') username = tweet.find("a")["name"] url = f"http://twitter.com/{username}?lang=en" else: logme.debug(__name__ + ':Multi:else-url') link = tweet.find("a", "tweet-timestamp js-permalink js-nav js-tooltip")["href"] url = f"https://twitter.com{link}?lang=en" if config.User_full: logme.debug(__name__ + ':Multi:user-full-Run') futures.append(loop.run_in_executor(executor, await User(url, config, conn))) else: logme.debug(__name__ + ':Multi:notUser-full-Run') futures.append(loop.run_in_executor(executor, await Tweet(url, config, conn))) logme.debug(__name__ + ':Multi:asyncioGather') await asyncio.gather(*futures) except Exception as e: # TODO: fix error not error # print(str(e) + " [x] get.Multi") # will return "'NoneType' object is not callable" # but still works # logme.critical(__name__+':Multi:' + str(e)) pass return count ================================================ FILE: twint/output.py ================================================ from datetime import datetime from . import format, get from .tweet import Tweet from .user import User from .storage import db, elasticsearch, write, panda import logging as logme follows_list = [] tweets_list = [] users_list = [] author_list = {''} author_list.pop() # used by Pandas _follows_object = {} def _formatDateTime(datetimestamp): try: return int(datetime.strptime(datetimestamp, "%Y-%m-%d %H:%M:%S").timestamp()) except ValueError: return int(datetime.strptime(datetimestamp, "%Y-%m-%d").timestamp()) def _clean_follow_list(): logme.debug(__name__ + ':clean_follow_list') global _follows_object _follows_object = {} def clean_lists(): logme.debug(__name__ + ':clean_lists') global follows_list global tweets_list global users_list follows_list = [] tweets_list = [] users_list = [] def datecheck(datetimestamp, config): logme.debug(__name__ + ':datecheck') if config.Since: logme.debug(__name__ + ':datecheck:SinceTrue') d = _formatDateTime(datetimestamp) s = _formatDateTime(config.Since) if d < s: return False if config.Until: logme.debug(__name__ + ':datecheck:UntilTrue') d = _formatDateTime(datetimestamp) s = _formatDateTime(config.Until) if d > s: return False logme.debug(__name__ + ':datecheck:dateRangeFalse') return True # TODO In this method we need to delete the quoted tweets, because twitter also sends the quoted tweets in the # `tweets` list along with the other tweets def is_tweet(tw): try: tw["data-item-id"] logme.debug(__name__ + ':is_tweet:True') return True except: logme.critical(__name__ + ':is_tweet:False') return False def _output(obj, output, config, **extra): logme.debug(__name__ + ':_output') if config.Lowercase: if isinstance(obj, str): logme.debug(__name__ + ':_output:Lowercase:username') obj = obj.lower() elif obj.__class__.__name__ == "user": logme.debug(__name__ + ':_output:Lowercase:user') pass elif obj.__class__.__name__ == "tweet": logme.debug(__name__ + ':_output:Lowercase:tweet') obj.username = obj.username.lower() author_list.update({obj.username}) for dct in obj.mentions: for key, val in dct.items(): dct[key] = val.lower() for i in range(len(obj.hashtags)): obj.hashtags[i] = obj.hashtags[i].lower() for i in range(len(obj.cashtags)): obj.cashtags[i] = obj.cashtags[i].lower() else: logme.info('_output:Lowercase:hiddenTweetFound') print("[x] Hidden tweet found, account suspended due to violation of TOS") return if config.Output != None: if config.Store_csv: try: write.Csv(obj, config) logme.debug(__name__ + ':_output:CSV') except Exception as e: logme.critical(__name__ + ':_output:CSV:Error:' + str(e)) print(str(e) + " [x] output._output") elif config.Store_json: write.Json(obj, config) logme.debug(__name__ + ':_output:JSON') else: write.Text(output, config.Output) logme.debug(__name__ + ':_output:Text') if config.Elasticsearch: logme.debug(__name__ + ':_output:Elasticsearch') print("", end=".", flush=True) else: if not config.Hide_output: try: print(output.replace('\n', ' ')) except UnicodeEncodeError: logme.critical(__name__ + ':_output:UnicodeEncodeError') print("unicode error [x] output._output") async def checkData(tweet, config, conn): logme.debug(__name__ + ':checkData') tweet = Tweet(tweet, config) if not tweet.datestamp: logme.critical(__name__ + ':checkData:hiddenTweetFound') print("[x] Hidden tweet found, account suspended due to violation of TOS") return if datecheck(tweet.datestamp + " " + tweet.timestamp, config): output = format.Tweet(config, tweet) if config.Database: logme.debug(__name__ + ':checkData:Database') db.tweets(conn, tweet, config) if config.Pandas: logme.debug(__name__ + ':checkData:Pandas') panda.update(tweet, config) if config.Store_object: logme.debug(__name__ + ':checkData:Store_object') if hasattr(config.Store_object_tweets_list, 'append'): config.Store_object_tweets_list.append(tweet) else: tweets_list.append(tweet) if config.Elasticsearch: logme.debug(__name__ + ':checkData:Elasticsearch') elasticsearch.Tweet(tweet, config) _output(tweet, output, config) # else: # logme.critical(__name__+':checkData:copyrightedTweet') async def Tweets(tweets, config, conn): logme.debug(__name__ + ':Tweets') if config.Favorites or config.Location: logme.debug(__name__ + ':Tweets:fav+full+loc') for tw in tweets: await checkData(tw, config, conn) elif config.TwitterSearch or config.Profile: logme.debug(__name__ + ':Tweets:TwitterSearch') await checkData(tweets, config, conn) else: logme.debug(__name__ + ':Tweets:else') if int(tweets["data-user-id"]) == config.User_id or config.Retweets: await checkData(tweets, config, conn) async def Users(u, config, conn): logme.debug(__name__ + ':User') global users_list user = User(u) output = format.User(config.Format, user) if config.Database: logme.debug(__name__ + ':User:Database') db.user(conn, config, user) if config.Elasticsearch: logme.debug(__name__ + ':User:Elasticsearch') _save_date = user.join_date _save_time = user.join_time user.join_date = str(datetime.strptime(user.join_date, "%d %b %Y")).split()[0] user.join_time = str(datetime.strptime(user.join_time, "%I:%M %p")).split()[1] elasticsearch.UserProfile(user, config) user.join_date = _save_date user.join_time = _save_time if config.Store_object: logme.debug(__name__ + ':User:Store_object') if hasattr(config.Store_object_follow_list, 'append'): config.Store_object_follow_list.append(user) elif hasattr(config.Store_object_users_list, 'append'): config.Store_object_users_list.append(user) else: users_list.append(user) # twint.user.user if config.Pandas: logme.debug(__name__ + ':User:Pandas+user') panda.update(user, config) _output(user, output, config) async def Username(username, config, conn): logme.debug(__name__ + ':Username') global _follows_object global follows_list follow_var = config.Following * "following" + config.Followers * "followers" if config.Database: logme.debug(__name__ + ':Username:Database') db.follow(conn, config.Username, config.Followers, username) if config.Elasticsearch: logme.debug(__name__ + ':Username:Elasticsearch') elasticsearch.Follow(username, config) if config.Store_object: if hasattr(config.Store_object_follow_list, 'append'): config.Store_object_follow_list.append(username) else: follows_list.append(username) # twint.user.user if config.Pandas: logme.debug(__name__ + ':Username:object+pandas') try: _ = _follows_object[config.Username][follow_var] except KeyError: _follows_object.update({config.Username: {follow_var: []}}) _follows_object[config.Username][follow_var].append(username) if config.Pandas_au: logme.debug(__name__ + ':Username:object+pandas+au') panda.update(_follows_object[config.Username], config) _output(username, username, config) ================================================ FILE: twint/run.py ================================================ import sys, os, datetime from asyncio import get_event_loop, TimeoutError, ensure_future, new_event_loop, set_event_loop from . import datelock, feed, get, output, verbose, storage from .token import TokenExpiryException from . import token from .storage import db from .feed import NoMoreTweetsException import logging as logme import time bearer = 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs' \ '%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA' class Twint: def __init__(self, config): logme.debug(__name__ + ':Twint:__init__') if config.Resume is not None and (config.TwitterSearch or config.Followers or config.Following): logme.debug(__name__ + ':Twint:__init__:Resume') self.init = self.get_resume(config.Resume) else: self.init = -1 config.deleted = [] self.feed: list = [-1] self.count = 0 self.user_agent = "" self.config = config self.config.Bearer_token = bearer # TODO might have to make some adjustments for it to work with multi-treading # USAGE : to get a new guest token simply do `self.token.refresh()` self.token = token.Token(config) self.token.refresh() self.conn = db.Conn(config.Database) self.d = datelock.Set(self.config.Until, self.config.Since) verbose.Elastic(config.Elasticsearch) if self.config.Store_object: logme.debug(__name__ + ':Twint:__init__:clean_follow_list') output._clean_follow_list() if self.config.Pandas_clean: logme.debug(__name__ + ':Twint:__init__:pandas_clean') storage.panda.clean() def get_resume(self, resumeFile): if not os.path.exists(resumeFile): return '-1' with open(resumeFile, 'r') as rFile: _init = rFile.readlines()[-1].strip('\n') return _init async def Feed(self): logme.debug(__name__ + ':Twint:Feed') consecutive_errors_count = 0 while True: # this will receive a JSON string, parse it into a `dict` and do the required stuff try: response = await get.RequestUrl(self.config, self.init) except TokenExpiryException as e: logme.debug(__name__ + 'Twint:Feed:' + str(e)) self.token.refresh() response = await get.RequestUrl(self.config, self.init) if self.config.Debug: print(response, file=open("twint-last-request.log", "w", encoding="utf-8")) self.feed = [] try: if self.config.Favorites: self.feed, self.init = feed.MobileFav(response) favorite_err_cnt = 0 if len(self.feed) == 0 and len(self.init) == 0: while (len(self.feed) == 0 or len(self.init) == 0) and favorite_err_cnt < 5: self.user_agent = await get.RandomUserAgent(wa=False) response = await get.RequestUrl(self.config, self.init, headers=[("User-Agent", self.user_agent)]) self.feed, self.init = feed.MobileFav(response) favorite_err_cnt += 1 time.sleep(1) if favorite_err_cnt == 5: print("Favorite page could not be fetched") if not self.count % 40: time.sleep(5) elif self.config.Followers or self.config.Following: self.feed, self.init = feed.Follow(response) if not self.count % 40: time.sleep(5) elif self.config.Profile or self.config.TwitterSearch: try: self.feed, self.init = feed.parse_tweets(self.config, response) except NoMoreTweetsException as e: logme.debug(__name__ + ':Twint:Feed:' + str(e)) print('[!] ' + str(e) + ' Scraping will stop now.') print('found {} deleted tweets in this search.'.format(len(self.config.deleted))) break break except TimeoutError as e: if self.config.Proxy_host.lower() == "tor": print("[?] Timed out, changing Tor identity...") if self.config.Tor_control_password is None: logme.critical(__name__ + ':Twint:Feed:tor-password') sys.stderr.write("Error: config.Tor_control_password must be set for proxy auto-rotation!\r\n") sys.stderr.write( "Info: What is it? See https://stem.torproject.org/faq.html#can-i-interact-with-tors" "-controller-interface-directly\r\n") break else: get.ForceNewTorIdentity(self.config) continue else: logme.critical(__name__ + ':Twint:Feed:' + str(e)) print(str(e)) break except Exception as e: if self.config.Profile or self.config.Favorites: print("[!] Twitter does not return more data, scrape stops here.") break logme.critical(__name__ + ':Twint:Feed:noData' + str(e)) # Sometimes Twitter says there is no data. But it's a lie. # raise consecutive_errors_count += 1 if consecutive_errors_count < self.config.Retries_count: # skip to the next iteration if wait time does not satisfy limit constraints delay = round(consecutive_errors_count ** self.config.Backoff_exponent, 1) # if the delay is less than users set min wait time then replace delay if self.config.Min_wait_time > delay: delay = self.config.Min_wait_time sys.stderr.write('sleeping for {} secs\n'.format(delay)) time.sleep(delay) self.user_agent = await get.RandomUserAgent(wa=True) continue logme.critical(__name__ + ':Twint:Feed:Tweets_known_error:' + str(e)) sys.stderr.write(str(e) + " [x] run.Feed") sys.stderr.write( "[!] if you get this error but you know for sure that more tweets exist, please open an issue and " "we will investigate it!") break if self.config.Resume: print(self.init, file=open(self.config.Resume, "a", encoding="utf-8")) async def follow(self): await self.Feed() if self.config.User_full: logme.debug(__name__ + ':Twint:follow:userFull') self.count += await get.Multi(self.feed, self.config, self.conn) else: logme.debug(__name__ + ':Twint:follow:notUserFull') for user in self.feed: self.count += 1 username = user.find("a")["name"] await output.Username(username, self.config, self.conn) async def favorite(self): logme.debug(__name__ + ':Twint:favorite') await self.Feed() favorited_tweets_list = [] for tweet in self.feed: tweet_dict = {} self.count += 1 try: tweet_dict['data-item-id'] = tweet.find("div", {"class": "tweet-text"})['data-id'] t_url = tweet.find("span", {"class": "metadata"}).find("a")["href"] tweet_dict['data-conversation-id'] = t_url.split('?')[0].split('/')[-1] tweet_dict['username'] = tweet.find("div", {"class": "username"}).text.replace('\n', '').replace(' ', '') tweet_dict['tweet'] = tweet.find("div", {"class": "tweet-text"}).find("div", {"class": "dir-ltr"}).text date_str = tweet.find("td", {"class": "timestamp"}).find("a").text # test_dates = ["1m", "2h", "Jun 21, 2019", "Mar 12", "28 Jun 19"] # date_str = test_dates[3] if len(date_str) <= 3 and (date_str[-1] == "m" or date_str[-1] == "h"): # 25m 1h dateu = str(datetime.date.today()) tweet_dict['date'] = dateu elif ',' in date_str: # Aug 21, 2019 sp = date_str.replace(',', '').split(' ') date_str_formatted = sp[1] + ' ' + sp[0] + ' ' + sp[2] dateu = datetime.datetime.strptime(date_str_formatted, "%d %b %Y").strftime("%Y-%m-%d") tweet_dict['date'] = dateu elif len(date_str.split(' ')) == 3: # 28 Jun 19 sp = date_str.split(' ') if len(sp[2]) == 2: sp[2] = '20' + sp[2] date_str_formatted = sp[0] + ' ' + sp[1] + ' ' + sp[2] dateu = datetime.datetime.strptime(date_str_formatted, "%d %b %Y").strftime("%Y-%m-%d") tweet_dict['date'] = dateu else: # Aug 21 sp = date_str.split(' ') date_str_formatted = sp[1] + ' ' + sp[0] + ' ' + str(datetime.date.today().year) dateu = datetime.datetime.strptime(date_str_formatted, "%d %b %Y").strftime("%Y-%m-%d") tweet_dict['date'] = dateu favorited_tweets_list.append(tweet_dict) except Exception as e: logme.critical(__name__ + ':Twint:favorite:favorite_field_lack') print("shit: ", date_str, " ", str(e)) try: self.config.favorited_tweets_list += favorited_tweets_list except AttributeError: self.config.favorited_tweets_list = favorited_tweets_list async def profile(self): await self.Feed() logme.debug(__name__ + ':Twint:profile') for tweet in self.feed: self.count += 1 await output.Tweets(tweet, self.config, self.conn) async def tweets(self): await self.Feed() # TODO : need to take care of this later if self.config.Location: logme.debug(__name__ + ':Twint:tweets:location') self.count += await get.Multi(self.feed, self.config, self.conn) else: logme.debug(__name__ + ':Twint:tweets:notLocation') for tweet in self.feed: self.count += 1 await output.Tweets(tweet, self.config, self.conn) async def main(self, callback=None): task = ensure_future(self.run()) # Might be changed to create_task in 3.7+. if callback: task.add_done_callback(callback) await task async def run(self): if self.config.TwitterSearch: self.user_agent = await get.RandomUserAgent(wa=True) else: self.user_agent = await get.RandomUserAgent() if self.config.User_id is not None and self.config.Username is None: logme.debug(__name__ + ':Twint:main:user_id') self.config.Username = await get.Username(self.config.User_id, self.config.Bearer_token, self.config.Guest_token) if self.config.Username is not None and self.config.User_id is None: logme.debug(__name__ + ':Twint:main:username') self.config.User_id = await get.User(self.config.Username, self.config, self.conn, True) if self.config.User_id is None: raise ValueError("Cannot find twitter account with name = " + self.config.Username) # TODO : will need to modify it to work with the new endpoints if self.config.TwitterSearch and self.config.Since and self.config.Until: logme.debug(__name__ + ':Twint:main:search+since+until') while self.d.since < self.d.until: self.config.Since = datetime.datetime.strftime(self.d.since, "%Y-%m-%d %H:%M:%S") self.config.Until = datetime.datetime.strftime(self.d.until, "%Y-%m-%d %H:%M:%S") if len(self.feed) > 0: await self.tweets() else: logme.debug(__name__ + ':Twint:main:gettingNewTweets') break if get.Limit(self.config.Limit, self.count): break elif self.config.Lookup: await self.Lookup() else: logme.debug(__name__ + ':Twint:main:not-search+since+until') while True: if len(self.feed) > 0: if self.config.Followers or self.config.Following: logme.debug(__name__ + ':Twint:main:follow') await self.follow() elif self.config.Favorites: logme.debug(__name__ + ':Twint:main:favorites') await self.favorite() elif self.config.Profile: logme.debug(__name__ + ':Twint:main:profile') await self.profile() elif self.config.TwitterSearch: logme.debug(__name__ + ':Twint:main:twitter-search') await self.tweets() else: logme.debug(__name__ + ':Twint:main:no-more-tweets') break # logging.info("[<] " + str(datetime.now()) + ':: run+Twint+main+CallingGetLimit2') if get.Limit(self.config.Limit, self.count): logme.debug(__name__ + ':Twint:main:reachedLimit') break if self.config.Count: verbose.Count(self.count, self.config) async def Lookup(self): logme.debug(__name__ + ':Twint:Lookup') try: if self.config.User_id is not None and self.config.Username is None: logme.debug(__name__ + ':Twint:Lookup:user_id') self.config.Username = await get.Username(self.config.User_id, self.config.Bearer_token, self.config.Guest_token) await get.User(self.config.Username, self.config, db.Conn(self.config.Database)) except Exception as e: logme.exception(__name__ + ':Twint:Lookup:Unexpected exception occurred.') raise def run(config, callback=None): logme.debug(__name__ + ':run') try: get_event_loop() except RuntimeError as e: if "no current event loop" in str(e): set_event_loop(new_event_loop()) else: logme.exception(__name__ + ':run:Unexpected exception while handling an expected RuntimeError.') raise except Exception as e: logme.exception( __name__ + ':run:Unexpected exception occurred while attempting to get or create a new event loop.') raise get_event_loop().run_until_complete(Twint(config).main(callback)) def Favorites(config): logme.debug(__name__ + ':Favorites') config.Favorites = True config.Following = False config.Followers = False config.Profile = False config.TwitterSearch = False run(config) if config.Pandas_au: storage.panda._autoget("tweet") def Followers(config): logme.debug(__name__ + ':Followers') config.Followers = True config.Following = False config.Profile = False config.Favorites = False config.TwitterSearch = False run(config) if config.Pandas_au: storage.panda._autoget("followers") if config.User_full: storage.panda._autoget("user") if config.Pandas_clean and not config.Store_object: # storage.panda.clean() output._clean_follow_list() def Following(config): logme.debug(__name__ + ':Following') config.Following = True config.Followers = False config.Profile = False config.Favorites = False config.TwitterSearch = False run(config) if config.Pandas_au: storage.panda._autoget("following") if config.User_full: storage.panda._autoget("user") if config.Pandas_clean and not config.Store_object: # storage.panda.clean() output._clean_follow_list() def Lookup(config): logme.debug(__name__ + ':Lookup') config.Profile = False config.Lookup = True config.Favorites = False config.FOllowing = False config.Followers = False config.TwitterSearch = False run(config) if config.Pandas_au: storage.panda._autoget("user") def Profile(config): logme.debug(__name__ + ':Profile') config.Profile = True config.Favorites = False config.Following = False config.Followers = False config.TwitterSearch = False run(config) if config.Pandas_au: storage.panda._autoget("tweet") def Search(config, callback=None): logme.debug(__name__ + ':Search') config.TwitterSearch = True config.Favorites = False config.Following = False config.Followers = False config.Profile = False run(config, callback) if config.Pandas_au: storage.panda._autoget("tweet") ================================================ FILE: twint/storage/__init__.py ================================================ ================================================ FILE: twint/storage/db.py ================================================ import sqlite3 import sys import time import hashlib from datetime import datetime def Conn(database): if database: print("[+] Inserting into Database: " + str(database)) conn = init(database) if isinstance(conn, str): # error print(conn) sys.exit(1) else: conn = "" return conn def init(db): try: conn = sqlite3.connect(db) cursor = conn.cursor() table_users = """ CREATE TABLE IF NOT EXISTS users( id integer not null, id_str text not null, name text, username text not null, bio text, location text, url text, join_date text not null, join_time text not null, tweets integer, following integer, followers integer, likes integer, media integer, private integer not null, verified integer not null, profile_image_url text not null, background_image text, hex_dig text not null, time_update integer not null, CONSTRAINT users_pk PRIMARY KEY (id, hex_dig) ); """ cursor.execute(table_users) table_tweets = """ CREATE TABLE IF NOT EXISTS tweets ( id integer not null, id_str text not null, tweet text default '', language text default '', conversation_id text not null, created_at integer not null, date text not null, time text not null, timezone text not null, place text default '', replies_count integer, likes_count integer, retweets_count integer, user_id integer not null, user_id_str text not null, screen_name text not null, name text default '', link text, mentions text, hashtags text, cashtags text, urls text, photos text, thumbnail text, quote_url text, video integer, geo text, near text, source text, time_update integer not null, `translate` text default '', trans_src text default '', trans_dest text default '', PRIMARY KEY (id) ); """ cursor.execute(table_tweets) table_retweets = """ CREATE TABLE IF NOT EXISTS retweets( user_id integer not null, username text not null, tweet_id integer not null, retweet_id integer not null, retweet_date integer, CONSTRAINT retweets_pk PRIMARY KEY(user_id, tweet_id), CONSTRAINT user_id_fk FOREIGN KEY(user_id) REFERENCES users(id), CONSTRAINT tweet_id_fk FOREIGN KEY(tweet_id) REFERENCES tweets(id) ); """ cursor.execute(table_retweets) table_reply_to = """ CREATE TABLE IF NOT EXISTS replies( tweet_id integer not null, user_id integer not null, username text not null, CONSTRAINT replies_pk PRIMARY KEY (user_id, tweet_id), CONSTRAINT tweet_id_fk FOREIGN KEY (tweet_id) REFERENCES tweets(id) ); """ cursor.execute(table_reply_to) table_favorites = """ CREATE TABLE IF NOT EXISTS favorites( user_id integer not null, tweet_id integer not null, CONSTRAINT favorites_pk PRIMARY KEY (user_id, tweet_id), CONSTRAINT user_id_fk FOREIGN KEY (user_id) REFERENCES users(id), CONSTRAINT tweet_id_fk FOREIGN KEY (tweet_id) REFERENCES tweets(id) ); """ cursor.execute(table_favorites) table_followers = """ CREATE TABLE IF NOT EXISTS followers ( id integer not null, follower_id integer not null, CONSTRAINT followers_pk PRIMARY KEY (id, follower_id), CONSTRAINT id_fk FOREIGN KEY(id) REFERENCES users(id), CONSTRAINT follower_id_fk FOREIGN KEY(follower_id) REFERENCES users(id) ); """ cursor.execute(table_followers) table_following = """ CREATE TABLE IF NOT EXISTS following ( id integer not null, following_id integer not null, CONSTRAINT following_pk PRIMARY KEY (id, following_id), CONSTRAINT id_fk FOREIGN KEY(id) REFERENCES users(id), CONSTRAINT following_id_fk FOREIGN KEY(following_id) REFERENCES users(id) ); """ cursor.execute(table_following) table_followers_names = """ CREATE TABLE IF NOT EXISTS followers_names ( user text not null, time_update integer not null, follower text not null, PRIMARY KEY (user, follower) ); """ cursor.execute(table_followers_names) table_following_names = """ CREATE TABLE IF NOT EXISTS following_names ( user text not null, time_update integer not null, follows text not null, PRIMARY KEY (user, follows) ); """ cursor.execute(table_following_names) return conn except Exception as e: return str(e) def fTable(Followers): if Followers: table = "followers_names" else: table = "following_names" return table def uTable(Followers): if Followers: table = "followers" else: table = "following" return table def follow(conn, Username, Followers, User): try: time_ms = round(time.time()*1000) cursor = conn.cursor() entry = (User, time_ms, Username,) table = fTable(Followers) query = f"INSERT INTO {table} VALUES(?,?,?)" cursor.execute(query, entry) conn.commit() except sqlite3.IntegrityError: pass def get_hash_id(conn, id): cursor = conn.cursor() cursor.execute('SELECT hex_dig FROM users WHERE id = ? LIMIT 1', (id,)) resultset = cursor.fetchall() return resultset[0][0] if resultset else -1 def user(conn, config, User): try: time_ms = round(time.time()*1000) cursor = conn.cursor() user = [int(User.id), User.id, User.name, User.username, User.bio, User.location, User.url,User.join_date, User.join_time, User.tweets, User.following, User.followers, User.likes, User.media_count, User.is_private, User.is_verified, User.avatar, User.background_image] hex_dig = hashlib.sha256(','.join(str(v) for v in user).encode()).hexdigest() entry = tuple(user) + (hex_dig,time_ms,) old_hash = get_hash_id(conn, User.id) if old_hash == -1 or old_hash != hex_dig: query = f"INSERT INTO users VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)" cursor.execute(query, entry) else: pass if config.Followers or config.Following: table = uTable(config.Followers) query = f"INSERT INTO {table} VALUES(?,?)" cursor.execute(query, (config.User_id, int(User.id))) conn.commit() except sqlite3.IntegrityError: pass def tweets(conn, Tweet, config): try: time_ms = round(time.time()*1000) cursor = conn.cursor() entry = (Tweet.id, Tweet.id_str, Tweet.tweet, Tweet.lang, Tweet.conversation_id, Tweet.datetime, Tweet.datestamp, Tweet.timestamp, Tweet.timezone, Tweet.place, Tweet.replies_count, Tweet.likes_count, Tweet.retweets_count, Tweet.user_id, Tweet.user_id_str, Tweet.username, Tweet.name, Tweet.link, ",".join(Tweet.mentions), ",".join(Tweet.hashtags), ",".join(Tweet.cashtags), ",".join(Tweet.urls), ",".join(Tweet.photos), Tweet.thumbnail, Tweet.quote_url, Tweet.video, Tweet.geo, Tweet.near, Tweet.source, time_ms, Tweet.translate, Tweet.trans_src, Tweet.trans_dest) cursor.execute('INSERT INTO tweets VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)', entry) if config.Favorites: query = 'INSERT INTO favorites VALUES(?,?)' cursor.execute(query, (config.User_id, Tweet.id)) if Tweet.retweet: query = 'INSERT INTO retweets VALUES(?,?,?,?,?)' _d = datetime.timestamp(datetime.strptime(Tweet.retweet_date, "%Y-%m-%d %H:%M:%S")) cursor.execute(query, (int(Tweet.user_rt_id), Tweet.user_rt, Tweet.id, int(Tweet.retweet_id), _d)) if Tweet.reply_to: for reply in Tweet.reply_to: query = 'INSERT INTO replies VALUES(?,?,?)' cursor.execute(query, (Tweet.id, int(reply['user_id']), reply['username'])) conn.commit() except sqlite3.IntegrityError: pass ================================================ FILE: twint/storage/elasticsearch.py ================================================ ## TODO - Fix Weekday situation from elasticsearch import Elasticsearch, helpers from geopy.geocoders import Nominatim from datetime import datetime import contextlib import sys _index_tweet_status = False _index_follow_status = False _index_user_status = False _is_near_def = False _is_location_def = False _near = {} _location = {} geolocator = Nominatim(user_agent="twint-1.2") class RecycleObject(object): def write(self, junk): pass def flush(self): pass def getLocation(place, **options): location = geolocator.geocode(place,timeout=1000) if location: if options.get("near"): global _near _near = {"lat": location.latitude, "lon": location.longitude} return True elif options.get("location"): global _location _location = {"lat": location.latitude, "lon": location.longitude} return True return {"lat": location.latitude, "lon": location.longitude} else: return {} def handleIndexResponse(response): try: if response["status"] == 400: return True except KeyError: pass if response["acknowledged"]: print("[+] Index \"" + response["index"] + "\" created!") else: print("[x] error index creation :: storage.elasticsearch.handleIndexCreation") if response["shards_acknowledged"]: print("[+] Shards acknowledged, everything is ready to be used!") return True else: print("[x] error with shards :: storage.elasticsearch.HandleIndexCreation") return False def createIndex(config, instance, **scope): if scope.get("scope") == "tweet": tweets_body = { "mappings": { "properties": { "id": {"type": "long"}, "conversation_id": {"type": "long"}, "created_at": {"type": "text"}, "date": {"type": "date", "format": "yyyy-MM-dd HH:mm:ss"}, "timezone": {"type": "keyword"}, "place": {"type": "keyword"}, "location": {"type": "keyword"}, "tweet": {"type": "text"}, "lang": {"type": "keyword"}, "hashtags": {"type": "keyword", "normalizer": "hashtag_normalizer"}, "cashtags": {"type": "keyword", "normalizer": "hashtag_normalizer"}, "user_id_str": {"type": "keyword"}, "username": {"type": "keyword", "normalizer": "hashtag_normalizer"}, "name": {"type": "text"}, "profile_image_url": {"type": "text"}, "day": {"type": "integer"}, "hour": {"type": "integer"}, "link": {"type": "text"}, "retweet": {"type": "text"}, "essid": {"type": "keyword"}, "nlikes": {"type": "integer"}, "nreplies": {"type": "integer"}, "nretweets": {"type": "integer"}, "quote_url": {"type": "text"}, "video": {"type":"integer"}, "thumbnail": {"type":"text"}, "search": {"type": "text"}, "near": {"type": "text"}, "geo_near": {"type": "geo_point"}, "geo_tweet": {"type": "geo_point"}, "photos": {"type": "text"}, "user_rt_id": {"type": "keyword"}, "mentions": {"type": "keyword", "normalizer": "hashtag_normalizer"}, "source": {"type": "keyword"}, "user_rt": {"type": "keyword"}, "retweet_id": {"type": "keyword"}, "reply_to": { "type": "nested", "properties": { "user_id": {"type": "keyword"}, "username": {"type": "keyword"} } }, "retweet_date": {"type": "date", "format": "yyyy-MM-dd HH:mm:ss", "ignore_malformed": True}, "urls": {"type": "keyword"}, "translate": {"type": "text"}, "trans_src": {"type": "keyword"}, "trans_dest": {"type": "keyword"}, } }, "settings": { "number_of_shards": 1, "analysis": { "normalizer": { "hashtag_normalizer": { "type": "custom", "char_filter": [], "filter": ["lowercase", "asciifolding"] } } } } } with nostdout(): resp = instance.indices.create(index=config.Index_tweets, body=tweets_body, ignore=400) return handleIndexResponse(resp) elif scope.get("scope") == "follow": follow_body = { "mappings": { "properties": { "user": {"type": "keyword"}, "follow": {"type": "keyword"}, "essid": {"type": "keyword"} } }, "settings": { "number_of_shards": 1 } } with nostdout(): resp = instance.indices.create(index=config.Index_follow, body=follow_body, ignore=400) return handleIndexResponse(resp) elif scope.get("scope") == "user": user_body = { "mappings": { "properties": { "id": {"type": "keyword"}, "name": {"type": "keyword"}, "username": {"type": "keyword"}, "bio": {"type": "text"}, "location": {"type": "keyword"}, "url": {"type": "text"}, "join_datetime": {"type": "date", "format": "yyyy-MM-dd HH:mm:ss"}, "tweets": {"type": "integer"}, "following": {"type": "integer"}, "followers": {"type": "integer"}, "likes": {"type": "integer"}, "media": {"type": "integer"}, "private": {"type": "integer"}, "verified": {"type": "integer"}, "avatar": {"type": "text"}, "background_image": {"type": "text"}, "session": {"type": "keyword"}, "geo_user": {"type": "geo_point"} } }, "settings": { "number_of_shards": 1 } } with nostdout(): resp = instance.indices.create(index=config.Index_users, body=user_body, ignore=400) return handleIndexResponse(resp) else: print("[x] error index pre-creation :: storage.elasticsearch.createIndex") return False @contextlib.contextmanager def nostdout(): savestdout = sys.stdout sys.stdout = RecycleObject() yield sys.stdout = savestdout def weekday(day): weekdays = { "Monday": 1, "Tuesday": 2, "Wednesday": 3, "Thursday": 4, "Friday": 5, "Saturday": 6, "Sunday": 7, } return weekdays[day] def Tweet(Tweet, config): global _index_tweet_status global _is_near_def date_obj = datetime.strptime(Tweet.datetime, "%Y-%m-%d %H:%M:%S %Z") actions = [] try: retweet = Tweet.retweet except AttributeError: retweet = None dt = f"{Tweet.datestamp} {Tweet.timestamp}" j_data = { "_index": config.Index_tweets, "_id": str(Tweet.id) + "_raw_" + config.Essid, "_source": { "id": str(Tweet.id), "conversation_id": Tweet.conversation_id, "created_at": Tweet.datetime, "date": dt, "timezone": Tweet.timezone, "place": Tweet.place, "tweet": Tweet.tweet, "language": Tweet.lang, "hashtags": Tweet.hashtags, "cashtags": Tweet.cashtags, "user_id_str": Tweet.user_id_str, "username": Tweet.username, "name": Tweet.name, "day": date_obj.weekday(), "hour": date_obj.hour, "link": Tweet.link, "retweet": retweet, "essid": config.Essid, "nlikes": int(Tweet.likes_count), "nreplies": int(Tweet.replies_count), "nretweets": int(Tweet.retweets_count), "quote_url": Tweet.quote_url, "video": Tweet.video, "search": str(config.Search), "near": config.Near } } if retweet is not None: j_data["_source"].update({"user_rt_id": Tweet.user_rt_id}) j_data["_source"].update({"user_rt": Tweet.user_rt}) j_data["_source"].update({"retweet_id": Tweet.retweet_id}) j_data["_source"].update({"retweet_date": Tweet.retweet_date}) if Tweet.reply_to: j_data["_source"].update({"reply_to": Tweet.reply_to}) if Tweet.photos: _photos = [] for photo in Tweet.photos: _photos.append(photo) j_data["_source"].update({"photos": _photos}) if Tweet.thumbnail: j_data["_source"].update({"thumbnail": Tweet.thumbnail}) if Tweet.mentions: _mentions = [] for mention in Tweet.mentions: _mentions.append(mention) j_data["_source"].update({"mentions": _mentions}) if Tweet.urls: _urls = [] for url in Tweet.urls: _urls.append(url) j_data["_source"].update({"urls": _urls}) if config.Near or config.Geo: if not _is_near_def: __geo = "" __near = "" if config.Geo: __geo = config.Geo if config.Near: __near = config.Near _is_near_def = getLocation(__near + __geo, near=True) if _near: j_data["_source"].update({"geo_near": _near}) if Tweet.place: _t_place = getLocation(Tweet.place) if _t_place: j_data["_source"].update({"geo_tweet": getLocation(Tweet.place)}) if Tweet.source: j_data["_source"].update({"source": Tweet.Source}) if config.Translate: j_data["_source"].update({"translate": Tweet.translate}) j_data["_source"].update({"trans_src": Tweet.trans_src}) j_data["_source"].update({"trans_dest": Tweet.trans_dest}) actions.append(j_data) es = Elasticsearch(config.Elasticsearch, verify_certs=config.Skip_certs) if not _index_tweet_status: _index_tweet_status = createIndex(config, es, scope="tweet") with nostdout(): helpers.bulk(es, actions, chunk_size=2000, request_timeout=200) actions = [] def Follow(user, config): global _index_follow_status actions = [] if config.Following: _user = config.Username _follow = user else: _user = user _follow = config.Username j_data = { "_index": config.Index_follow, "_id": _user + "_" + _follow + "_" + config.Essid, "_source": { "user": _user, "follow": _follow, "essid": config.Essid } } actions.append(j_data) es = Elasticsearch(config.Elasticsearch, verify_certs=config.Skip_certs) if not _index_follow_status: _index_follow_status = createIndex(config, es, scope="follow") with nostdout(): helpers.bulk(es, actions, chunk_size=2000, request_timeout=200) actions = [] def UserProfile(user, config): global _index_user_status global _is_location_def actions = [] j_data = { "_index": config.Index_users, "_id": user.id + "_" + user.join_date + "_" + user.join_time + "_" + config.Essid, "_source": { "id": user.id, "name": user.name, "username": user.username, "bio": user.bio, "location": user.location, "url": user.url, "join_datetime": user.join_date + " " + user.join_time, "tweets": user.tweets, "following": user.following, "followers": user.followers, "likes": user.likes, "media": user.media_count, "private": user.is_private, "verified": user.is_verified, "avatar": user.avatar, "background_image": user.background_image, "session": config.Essid } } if config.Location: if not _is_location_def: _is_location_def = getLocation(user.location, location=True) if _location: j_data["_source"].update({"geo_user": _location}) actions.append(j_data) es = Elasticsearch(config.Elasticsearch, verify_certs=config.Skip_certs) if not _index_user_status: _index_user_status = createIndex(config, es, scope="user") with nostdout(): helpers.bulk(es, actions, chunk_size=2000, request_timeout=200) actions = [] ================================================ FILE: twint/storage/panda.py ================================================ import datetime, pandas as pd, warnings from time import strftime, localtime from twint.tweet import Tweet_formats Tweets_df = None Follow_df = None User_df = None _object_blocks = { "tweet": [], "user": [], "following": [], "followers": [] } weekdays = { "Monday": 1, "Tuesday": 2, "Wednesday": 3, "Thursday": 4, "Friday": 5, "Saturday": 6, "Sunday": 7, } _type = "" def _concat(df, _type): if df is None: df = pd.DataFrame(_object_blocks[_type]) else: _df = pd.DataFrame(_object_blocks[_type]) df = pd.concat([df, _df], sort=True) return df def _autoget(_type): global Tweets_df global Follow_df global User_df if _type == "tweet": Tweets_df = _concat(Tweets_df, _type) elif _type == "followers" or _type == "following": Follow_df = _concat(Follow_df, _type) elif _type == "user": User_df = _concat(User_df, _type) else: error("[x] Wrong type of object passed") def update(object, config): global _type #try: # _type = ((object.__class__.__name__ == "tweet")*"tweet" + # (object.__class__.__name__ == "user")*"user") #except AttributeError: # _type = config.Following*"following" + config.Followers*"followers" if object.__class__.__name__ == "tweet": _type = "tweet" elif object.__class__.__name__ == "user": _type = "user" elif object.__class__.__name__ == "dict": _type = config.Following*"following" + config.Followers*"followers" if _type == "tweet": Tweet = object datetime_ms = datetime.datetime.strptime(Tweet.datetime, Tweet_formats['datetime']).timestamp() * 1000 day = weekdays[strftime("%A", localtime(datetime_ms/1000))] dt = f"{object.datestamp} {object.timestamp}" _data = { "id": str(Tweet.id), "conversation_id": Tweet.conversation_id, "created_at": datetime_ms, "date": dt, "timezone": Tweet.timezone, "place": Tweet.place, "tweet": Tweet.tweet, "language": Tweet.lang, "hashtags": Tweet.hashtags, "cashtags": Tweet.cashtags, "user_id": Tweet.user_id, "user_id_str": Tweet.user_id_str, "username": Tweet.username, "name": Tweet.name, "day": day, "hour": strftime("%H", localtime(datetime_ms/1000)), "link": Tweet.link, "urls": Tweet.urls, "photos": Tweet.photos, "video": Tweet.video, "thumbnail": Tweet.thumbnail, "retweet": Tweet.retweet, "nlikes": int(Tweet.likes_count), "nreplies": int(Tweet.replies_count), "nretweets": int(Tweet.retweets_count), "quote_url": Tweet.quote_url, "search": str(config.Search), "near": Tweet.near, "geo": Tweet.geo, "source": Tweet.source, "user_rt_id": Tweet.user_rt_id, "user_rt": Tweet.user_rt, "retweet_id": Tweet.retweet_id, "reply_to": Tweet.reply_to, "retweet_date": Tweet.retweet_date, "translate": Tweet.translate, "trans_src": Tweet.trans_src, "trans_dest": Tweet.trans_dest } _object_blocks[_type].append(_data) elif _type == "user": user = object try: background_image = user.background_image except: background_image = "" _data = { "id": user.id, "name": user.name, "username": user.username, "bio": user.bio, "url": user.url, "join_datetime": user.join_date + " " + user.join_time, "join_date": user.join_date, "join_time": user.join_time, "tweets": user.tweets, "location": user.location, "following": user.following, "followers": user.followers, "likes": user.likes, "media": user.media_count, "private": user.is_private, "verified": user.is_verified, "avatar": user.avatar, "background_image": background_image, } _object_blocks[_type].append(_data) elif _type == "followers" or _type == "following": _data = { config.Following*"following" + config.Followers*"followers" : {config.Username: object[_type]} } _object_blocks[_type] = _data else: print("Wrong type of object passed!") def clean(): global Tweets_df global Follow_df global User_df _object_blocks["tweet"].clear() _object_blocks["following"].clear() _object_blocks["followers"].clear() _object_blocks["user"].clear() Tweets_df = None Follow_df = None User_df = None def save(_filename, _dataframe, **options): if options.get("dataname"): _dataname = options.get("dataname") else: _dataname = "twint" if not options.get("type"): with warnings.catch_warnings(): warnings.simplefilter("ignore") _store = pd.HDFStore(_filename + ".h5") _store[_dataname] = _dataframe _store.close() elif options.get("type") == "Pickle": with warnings.catch_warnings(): warnings.simplefilter("ignore") _dataframe.to_pickle(_filename + ".pkl") else: print("""Please specify: filename, DataFrame, DataFrame name and type (HDF5, default, or Pickle)""") def read(_filename, **options): if not options.get("dataname"): _dataname = "twint" else: _dataname = options.get("dataname") if not options.get("type"): _store = pd.HDFStore(_filename + ".h5") _df = _store[_dataname] return _df elif options.get("type") == "Pickle": _df = pd.read_pickle(_filename + ".pkl") return _df else: print("""Please specify: DataFrame, DataFrame name (twint as default), filename and type (HDF5, default, or Pickle""") ================================================ FILE: twint/storage/write.py ================================================ from . import write_meta as meta import csv import json import os def outputExt(objType, fType): if objType == "str": objType = "username" outExt = f"/{objType}s.{fType}" return outExt def addExt(base, objType, fType): if len(base.split('.')) == 1: createDirIfMissing(base) base += outputExt(objType, fType) return base def Text(entry, f): print(entry.replace('\n', ' '), file=open(f, "a", encoding="utf-8")) def Type(config): if config.User_full: _type = "user" elif config.Followers or config.Following: _type = "username" else: _type = "tweet" return _type def struct(obj, custom, _type): if custom: fieldnames = custom row = {} for f in fieldnames: row[f] = meta.Data(obj, _type)[f] else: fieldnames = meta.Fieldnames(_type) row = meta.Data(obj, _type) return fieldnames, row def createDirIfMissing(dirname): if not os.path.exists(dirname): os.makedirs(dirname) def Csv(obj, config): _obj_type = obj.__class__.__name__ if _obj_type == "str": _obj_type = "username" fieldnames, row = struct(obj, config.Custom[_obj_type], _obj_type) base = addExt(config.Output, _obj_type, "csv") dialect = 'excel-tab' if 'Tabs' in config.__dict__ else 'excel' if not (os.path.exists(base)): with open(base, "w", newline='', encoding="utf-8") as csv_file: writer = csv.DictWriter(csv_file, fieldnames=fieldnames, dialect=dialect) writer.writeheader() with open(base, "a", newline='', encoding="utf-8") as csv_file: writer = csv.DictWriter(csv_file, fieldnames=fieldnames, dialect=dialect) writer.writerow(row) def Json(obj, config): _obj_type = obj.__class__.__name__ if _obj_type == "str": _obj_type = "username" null, data = struct(obj, config.Custom[_obj_type], _obj_type) base = addExt(config.Output, _obj_type, "json") with open(base, "a", newline='', encoding="utf-8") as json_file: json.dump(data, json_file, ensure_ascii=False) json_file.write("\n") ================================================ FILE: twint/storage/write_meta.py ================================================ def tweetData(t): data = { "id": int(t.id), "conversation_id": t.conversation_id, "created_at": t.datetime, "date": t.datestamp, "time": t.timestamp, "timezone": t.timezone, "user_id": t.user_id, "username": t.username, "name": t.name, "place": t.place, "tweet": t.tweet, "language": t.lang, "mentions": t.mentions, "urls": t.urls, "photos": t.photos, "replies_count": int(t.replies_count), "retweets_count": int(t.retweets_count), "likes_count": int(t.likes_count), "hashtags": t.hashtags, "cashtags": t.cashtags, "link": t.link, "retweet": t.retweet, "quote_url": t.quote_url, "video": t.video, "thumbnail": t.thumbnail, "near": t.near, "geo": t.geo, "source": t.source, "user_rt_id": t.user_rt_id, "user_rt": t.user_rt, "retweet_id": t.retweet_id, "reply_to": t.reply_to, "retweet_date": t.retweet_date, "translate": t.translate, "trans_src": t.trans_src, "trans_dest": t.trans_dest, } return data def tweetFieldnames(): fieldnames = [ "id", "conversation_id", "created_at", "date", "time", "timezone", "user_id", "username", "name", "place", "tweet", "language", "mentions", "urls", "photos", "replies_count", "retweets_count", "likes_count", "hashtags", "cashtags", "link", "retweet", "quote_url", "video", "thumbnail", "near", "geo", "source", "user_rt_id", "user_rt", "retweet_id", "reply_to", "retweet_date", "translate", "trans_src", "trans_dest" ] return fieldnames def userData(u): data = { "id": int(u.id), "name": u.name, "username": u.username, "bio": u.bio, "location": u.location, "url": u.url, "join_date": u.join_date, "join_time": u.join_time, "tweets": int(u.tweets), "following": int(u.following), "followers": int(u.followers), "likes": int(u.likes), "media": int(u.media_count), "private": u.is_private, "verified": u.is_verified, "profile_image_url": u.avatar, "background_image": u.background_image } return data def userFieldnames(): fieldnames = [ "id", "name", "username", "bio", "location", "url", "join_date", "join_time", "tweets", "following", "followers", "likes", "media", "private", "verified", "profile_image_url", "background_image" ] return fieldnames def usernameData(u): return {"username": u} def usernameFieldnames(): return ["username"] def Data(obj, _type): if _type == "user": ret = userData(obj) elif _type == "username": ret = usernameData(obj) else: ret = tweetData(obj) return ret def Fieldnames(_type): if _type == "user": ret = userFieldnames() elif _type == "username": ret = usernameFieldnames() else: ret = tweetFieldnames() return ret ================================================ FILE: twint/token.py ================================================ import re import time import requests import logging as logme class TokenExpiryException(Exception): def __init__(self, msg): super().__init__(msg) class RefreshTokenException(Exception): def __init__(self, msg): super().__init__(msg) class Token: def __init__(self, config): self._session = requests.Session() self._session.headers.update({'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:78.0) Gecko/20100101 Firefox/78.0'}) self.config = config self._retries = 5 self._timeout = 10 self.url = 'https://twitter.com' def _request(self): for attempt in range(self._retries + 1): # The request is newly prepared on each retry because of potential cookie updates. req = self._session.prepare_request(requests.Request('GET', self.url)) logme.debug(f'Retrieving {req.url}') try: r = self._session.send(req, allow_redirects=True, timeout=self._timeout) except requests.exceptions.RequestException as exc: if attempt < self._retries: retrying = ', retrying' level = logme.WARNING else: retrying = '' level = logme.ERROR logme.log(level, f'Error retrieving {req.url}: {exc!r}{retrying}') else: success, msg = (True, None) msg = f': {msg}' if msg else '' if success: logme.debug(f'{req.url} retrieved successfully{msg}') return r if attempt < self._retries: # TODO : might wanna tweak this back-off timer sleep_time = 2.0 * 2 ** attempt logme.info(f'Waiting {sleep_time:.0f} seconds') time.sleep(sleep_time) else: msg = f'{self._retries + 1} requests to {self.url} failed, giving up.' logme.fatal(msg) self.config.Guest_token = None raise RefreshTokenException(msg) def refresh(self): logme.debug('Retrieving guest token') res = self._request() match = re.search(r'\("gt=(\d+);', res.text) if match: logme.debug('Found guest token in HTML') self.config.Guest_token = str(match.group(1)) else: self.config.Guest_token = None raise RefreshTokenException('Could not find the Guest token in HTML') ================================================ FILE: twint/tweet.py ================================================ from time import strftime, localtime from datetime import datetime, timezone import logging as logme from googletransx import Translator # ref. # - https://github.com/x0rzkov/py-googletrans#basic-usage translator = Translator() class tweet: """Define Tweet class """ type = "tweet" def __init__(self): pass def utc_to_local(utc_dt): return utc_dt.replace(tzinfo=timezone.utc).astimezone(tz=None) Tweet_formats = { 'datetime': '%Y-%m-%d %H:%M:%S %Z', 'datestamp': '%Y-%m-%d', 'timestamp': '%H:%M:%S' } def _get_mentions(tw): """Extract mentions from tweet """ logme.debug(__name__ + ':get_mentions') try: mentions = [ { 'screen_name': _mention['screen_name'], 'name': _mention['name'], 'id': _mention['id_str'], } for _mention in tw['entities']['user_mentions'] if tw['display_text_range'][0] < _mention['indices'][0] ] except KeyError: mentions = [] return mentions def _get_reply_to(tw): try: reply_to = [ { 'screen_name': _mention['screen_name'], 'name': _mention['name'], 'id': _mention['id_str'], } for _mention in tw['entities']['user_mentions'] if tw['display_text_range'][0] > _mention['indices'][1] ] except KeyError: reply_to = [] return reply_to def getText(tw): """Replace some text """ logme.debug(__name__ + ':getText') text = tw['full_text'] text = text.replace("http", " http") text = text.replace("pic.twitter", " pic.twitter") text = text.replace("\n", " ") return text def Tweet(tw, config): """Create Tweet object """ logme.debug(__name__ + ':Tweet') t = tweet() t.id = int(tw['id_str']) t.id_str = tw["id_str"] t.conversation_id = tw["conversation_id_str"] # parsing date to user-friendly format _dt = tw['created_at'] _dt = datetime.strptime(_dt, '%a %b %d %H:%M:%S %z %Y') _dt = utc_to_local(_dt) t.datetime = str(_dt.strftime(Tweet_formats['datetime'])) # date is of the format year, t.datestamp = _dt.strftime(Tweet_formats['datestamp']) t.timestamp = _dt.strftime(Tweet_formats['timestamp']) t.user_id = int(tw["user_id_str"]) t.user_id_str = tw["user_id_str"] t.username = tw["user_data"]['screen_name'] t.name = tw["user_data"]['name'] t.place = tw['geo'] if 'geo' in tw and tw['geo'] else "" t.timezone = strftime("%z", localtime()) t.mentions = _get_mentions(tw) t.reply_to = _get_reply_to(tw) try: t.urls = [_url['expanded_url'] for _url in tw['entities']['urls']] except KeyError: t.urls = [] try: t.photos = [_img['media_url_https'] for _img in tw['entities']['media'] if _img['type'] == 'photo' and _img['expanded_url'].find('/photo/') != -1] except KeyError: t.photos = [] try: t.video = 1 if len(tw['extended_entities']['media']) else 0 except KeyError: t.video = 0 try: t.thumbnail = tw['extended_entities']['media'][0]['media_url_https'] except KeyError: t.thumbnail = '' t.tweet = getText(tw) t.lang = tw['lang'] try: t.hashtags = [hashtag['text'] for hashtag in tw['entities']['hashtags']] except KeyError: t.hashtags = [] try: t.cashtags = [cashtag['text'] for cashtag in tw['entities']['symbols']] except KeyError: t.cashtags = [] t.replies_count = tw['reply_count'] t.retweets_count = tw['retweet_count'] t.likes_count = tw['favorite_count'] t.link = f"https://twitter.com/{t.username}/status/{t.id}" try: if 'user_rt_id' in tw['retweet_data']: t.retweet = True t.retweet_id = tw['retweet_data']['retweet_id'] t.retweet_date = tw['retweet_data']['retweet_date'] t.user_rt = tw['retweet_data']['user_rt'] t.user_rt_id = tw['retweet_data']['user_rt_id'] except KeyError: t.retweet = False t.retweet_id = '' t.retweet_date = '' t.user_rt = '' t.user_rt_id = '' try: t.quote_url = tw['quoted_status_permalink']['expanded'] if tw['is_quote_status'] else '' except KeyError: # means that the quoted tweet have been deleted t.quote_url = 0 t.near = config.Near if config.Near else "" t.geo = config.Geo if config.Geo else "" t.source = config.Source if config.Source else "" t.translate = '' t.trans_src = '' t.trans_dest = '' if config.Translate: try: ts = translator.translate(text=t.tweet, dest=config.TranslateDest) t.translate = ts.text t.trans_src = ts.src t.trans_dest = ts.dest # ref. https://github.com/SuniTheFish/ChainTranslator/blob/master/ChainTranslator/__main__.py#L31 except ValueError as e: logme.debug(__name__ + ':Tweet:translator.translate:' + str(e)) raise Exception("Invalid destination language: {} / Tweet: {}".format(config.TranslateDest, t.tweet)) return t ================================================ FILE: twint/url.py ================================================ import datetime from sys import platform import logging as logme from urllib.parse import urlencode from urllib.parse import quote mobile = "https://mobile.twitter.com" base = "https://api.twitter.com/2/search/adaptive.json" def _sanitizeQuery(_url, params): _serialQuery = "" _serialQuery = urlencode(params, quote_via=quote) _serialQuery = _url + "?" + _serialQuery return _serialQuery def _formatDate(date): if "win" in platform: return f'\"{date.split()[0]}\"' try: return int(datetime.datetime.strptime(date, "%Y-%m-%d %H:%M:%S").timestamp()) except ValueError: return int(datetime.datetime.strptime(date, "%Y-%m-%d").timestamp()) async def Favorites(username, init): logme.debug(__name__ + ':Favorites') url = f"{mobile}/{username}/favorites?lang=en" if init != '-1': url += f"&max_id={init}" return url async def Followers(username, init): logme.debug(__name__ + ':Followers') url = f"{mobile}/{username}/followers?lang=en" if init != '-1': url += f"&cursor={init}" return url async def Following(username, init): logme.debug(__name__ + ':Following') url = f"{mobile}/{username}/following?lang=en" if init != '-1': url += f"&cursor={init}" return url async def MobileProfile(username, init): logme.debug(__name__ + ':MobileProfile') url = f"{mobile}/{username}?lang=en" if init != '-1': url += f"&max_id={init}" return url async def Search(config, init): logme.debug(__name__ + ':Search') url = base tweet_count = 100 q = "" params = [ # ('include_blocking', '1'), # ('include_blocked_by', '1'), # ('include_followed_by', '1'), # ('include_want_retweets', '1'), # ('include_mute_edge', '1'), # ('include_can_dm', '1'), ('include_can_media_tag', '1'), # ('skip_status', '1'), # ('include_cards', '1'), ('include_ext_alt_text', 'true'), ('include_quote_count', 'true'), ('include_reply_count', '1'), ('tweet_mode', 'extended'), ('include_entities', 'true'), ('include_user_entities', 'true'), ('include_ext_media_availability', 'true'), ('send_error_codes', 'true'), ('simple_quoted_tweet', 'true'), ('count', tweet_count), # ('query_source', 'typed_query'), # ('pc', '1'), ('cursor', str(init)), ('spelling_corrections', '1'), ('ext', 'mediaStats%2ChighlightedLabel'), ('tweet_search_mode', 'live'), # this can be handled better, maybe take an argument and set it then ] if not config.Popular_tweets: params.append(('f', 'tweets')) if config.Lang: params.append(("l", config.Lang)) params.append(("lang", "en")) if config.Query: q += f" from:{config.Query}" if config.Username: q += f" from:{config.Username}" if config.Geo: config.Geo = config.Geo.replace(" ", "") q += f" geocode:{config.Geo}" if config.Search: q += f" {config.Search}" if config.Year: q += f" until:{config.Year}-1-1" if config.Since: q += f" since:{_formatDate(config.Since)}" if config.Until: q += f" until:{_formatDate(config.Until)}" if config.Email: q += ' "mail" OR "email" OR' q += ' "gmail" OR "e-mail"' if config.Phone: q += ' "phone" OR "call me" OR "text me"' if config.Verified: q += " filter:verified" if config.To: q += f" to:{config.To}" if config.All: q += f" to:{config.All} OR from:{config.All} OR @{config.All}" if config.Near: q += f' near:"{config.Near}"' if config.Images: q += " filter:images" if config.Videos: q += " filter:videos" if config.Media: q += " filter:media" if config.Replies: q += " filter:replies" # although this filter can still be used, but I found it broken in my preliminary testing, needs more testing if config.Native_retweets: q += " filter:nativeretweets" if config.Min_likes: q += f" min_faves:{config.Min_likes}" if config.Min_retweets: q += f" min_retweets:{config.Min_retweets}" if config.Min_replies: q += f" min_replies:{config.Min_replies}" if config.Links == "include": q += " filter:links" elif config.Links == "exclude": q += " exclude:links" if config.Source: q += f" source:\"{config.Source}\"" if config.Members_list: q += f" list:{config.Members_list}" if config.Filter_retweets: q += f" exclude:nativeretweets exclude:retweets" if config.Custom_query: q = config.Custom_query q = q.strip() params.append(("q", q)) _serialQuery = _sanitizeQuery(url, params) return url, params, _serialQuery def SearchProfile(config, init=None): logme.debug(__name__ + ':SearchProfile') _url = 'https://api.twitter.com/2/timeline/profile/{user_id}.json'.format(user_id=config.User_id) tweet_count = 100 params = [ # some of the fields are not required, need to test which ones aren't required ('include_profile_interstitial_type', '1'), ('include_blocking', '1'), ('include_blocked_by', '1'), ('include_followed_by', '1'), ('include_want_retweets', '1'), ('include_mute_edge', '1'), ('include_can_dm', '1'), ('include_can_media_tag', '1'), ('skip_status', '1'), ('cards_platform', 'Web - 12'), ('include_cards', '1'), ('include_ext_alt_text', 'true'), ('include_quote_count', 'true'), ('include_reply_count', '1'), ('tweet_mode', 'extended'), ('include_entities', 'true'), ('include_user_entities', 'true'), ('include_ext_media_color', 'true'), ('include_ext_media_availability', 'true'), ('send_error_codes', 'true'), ('simple_quoted_tweet', 'true'), ('include_tweet_replies', 'true'), ('count', tweet_count), ('ext', 'mediaStats%2ChighlightedLabel'), ] if type(init) == str: params.append(('cursor', str(init))) _serialQuery = _sanitizeQuery(_url, params) return _url, params, _serialQuery ================================================ FILE: twint/user.py ================================================ import datetime import logging as logme class user: type = "user" def __init__(self): pass User_formats = { 'join_date': '%Y-%m-%d', 'join_time': '%H:%M:%S %Z' } # ur object must be a json from the endpoint https://api.twitter.com/graphql def User(ur): logme.debug(__name__ + ':User') if 'data' not in ur and 'user' not in ur['data']: msg = 'malformed json! cannot be parsed to get user data' logme.fatal(msg) raise KeyError(msg) _usr = user() _usr.id = ur['data']['user']['rest_id'] _usr.name = ur['data']['user']['legacy']['name'] _usr.username = ur['data']['user']['legacy']['screen_name'] _usr.bio = ur['data']['user']['legacy']['description'] _usr.location = ur['data']['user']['legacy']['location'] _usr.url = ur['data']['user']['legacy']['url'] # parsing date to user-friendly format _dt = ur['data']['user']['legacy']['created_at'] _dt = datetime.datetime.strptime(_dt, '%a %b %d %H:%M:%S %z %Y') # date is of the format year, _usr.join_date = _dt.strftime(User_formats['join_date']) _usr.join_time = _dt.strftime(User_formats['join_time']) # :type `int` _usr.tweets = int(ur['data']['user']['legacy']['statuses_count']) _usr.following = int(ur['data']['user']['legacy']['friends_count']) _usr.followers = int(ur['data']['user']['legacy']['followers_count']) _usr.likes = int(ur['data']['user']['legacy']['favourites_count']) _usr.media_count = int(ur['data']['user']['legacy']['media_count']) _usr.is_private = ur['data']['user']['legacy']['protected'] _usr.is_verified = ur['data']['user']['legacy']['verified'] _usr.avatar = ur['data']['user']['legacy']['profile_image_url_https'] _usr.background_image = ur['data']['user']['legacy']['profile_banner_url'] # TODO : future implementation # legacy_extended_profile is also available in some cases which can be used to get DOB of user return _usr ================================================ FILE: twint/verbose.py ================================================ def Count(count, config): msg = "[+] Finished: Successfully collected " if config.Followers: msg += f"all {count} users who follow @{config.Username}" elif config.Following: msg += f"all {count} users who @{config.Username} follows" elif config.Favorites: msg += f"{count} Tweets that @{config.Username} liked" else: msg += f"{count} Tweets" if config.Username: msg += f" from @{config.Username}" msg += "." print(msg) def Elastic(elasticsearch): if elasticsearch: print("[+] Indexing to Elasticsearch @ " + str(elasticsearch))