Repository: mehranshakarami/AI_Spectrum
Branch: main
Commit: 88cd3d85f6a6
Files: 29
Total size: 29.4 KB

Directory structure:
gitextract_k_b5pfd1/

├── .gitattributes
├── .gitignore
├── 2021/
│   └── Twitter_API/
│       ├── config.ini
│       └── twitter_api.py
├── 2022/
│   ├── Math_plotter/
│   │   ├── math_plotter.py
│   │   └── mathplotter/
│   │       ├── __init__.py
│   │       ├── click_and_crop.py
│   │       ├── latexPlotter.py
│   │       ├── readEquations.py
│   │       └── utils.py
│   ├── Sentiment_Analysis/
│   │   └── tw-sentiment.py
│   ├── Twitter_API/
│   │   ├── config.ini
│   │   ├── twitter_data_search.py
│   │   ├── twitter_data_stream.py
│   │   └── twitter_data_users.py
│   ├── Web_Scraping/
│   │   └── bs-amazon.py
│   └── snscrape/
│       └── tweets.py
├── 2024/
│   ├── Multi-lingual sentiment analysis/
│   │   ├── main.py
│   │   ├── readme.md
│   │   ├── requirements.txt
│   │   ├── sentiment.py
│   │   ├── test_labels.csv
│   │   ├── translate.py
│   │   └── tweets.csv
│   └── Twikit/
│       ├── config.ini
│       ├── main.py
│       ├── readme.md
│       └── tweets.csv
└── README.md

================================================
FILE CONTENTS
================================================

================================================
FILE: .gitattributes
================================================
# Auto detect text files and perform LF normalization
* text=auto


================================================
FILE: .gitignore
================================================
# Mac folder
*DS_Store

================================================
FILE: 2021/Twitter_API/config.ini
================================================
[twitter]

api_key = 
api_key_secret = 

access_token = 
access_token_secret = 

================================================
FILE: 2021/Twitter_API/twitter_api.py
================================================
import tweepy
import configparser
import pandas as pd

# Load the Twitter credentials from the [twitter] section of config.ini
config = configparser.ConfigParser()
config.read('config.ini')
twitter_section = config['twitter']

api_key = twitter_section['api_key']
api_key_secret = twitter_section['api_key_secret']
access_token = twitter_section['access_token']
access_token_secret = twitter_section['access_token_secret']

# Build an authenticated API client
auth = tweepy.OAuthHandler(api_key, api_key_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)

# Fetch tweets through api.home_timeline()
public_tweets = api.home_timeline()

# One row per tweet: creation time, author screen name, text
columns = ['Time', 'User', 'Tweet']
data = [
    [tweet.created_at, tweet.user.screen_name, tweet.text]
    for tweet in public_tweets
]

df = pd.DataFrame(data, columns=columns)
df.to_csv('tweets.csv')

================================================
FILE: 2022/Math_plotter/math_plotter.py
================================================
from turtle import width

from matplotlib.pyplot import axis
from mathreader.api import *
from mathreader.config import Configuration
from mathreader.helpers.exceptions import *
import base64
import numpy as np
import cv2
import sys
from PIL import ImageGrab
from time import sleep
from mathplotter.readEquations import find_equations, frame_change
from mathplotter.click_and_crop import image_crop, add_text
import matplotlib.pyplot as plt
from mathplotter.latexPlotter import plot_eq
import mathplotter.utils as utils
import imutils


# Plot palettes from utils.plotColors(): the first entry holds the colour
# tuples used with OpenCV, the second the hex strings used with matplotlib.
colors = utils.plotColors()
cv2_color, plt_color = colors


def hmp(cam=0, width=500, new_back=True):
    """Watch a webcam for handwritten equations and plot the recognised ones.

    Each frame is cropped to a stored region, compared against a background
    image by ``find_equations`` and shown in the ``"pic"`` window with a box
    around every detected equation.  When the "OCR" trackbar is switched
    from 0 to 1, every detected equation image is passed to the mathreader
    recogniser and the expressions containing ``=`` are handed to
    ``plot_eq``.  Press Esc to stop.

    Args:
        cam: Camera index handed to ``cv2.VideoCapture``.
        width: Target width handed to ``imutils.resize`` for every frame.
        new_back: When true, interactively capture a new background and crop
            box with ``image_crop`` and store them in
            ``frame_background.jpg`` / ``crop_box.txt``; otherwise load the
            previously stored pair from those files.

    Raises:
        IOError: If the camera cannot be opened or the stored background
            image cannot be read.
    """
    # plots inits
    # plt.ion()
    fig = plt.figure(figsize=(8, 5), tight_layout=True)
    ax = fig.gca()
    plt.pause(0.0001)

    # Kept from the original setup; the object itself is not used below.
    configs = Configuration()
    hme_recognizer = HME_Recognizer()

    cap = cv2.VideoCapture(cam)

    if not cap.isOpened():
        raise IOError("Cannot open webcam")

    try:
        # figs preparations
        if new_back:
            selection = image_crop(cam=cam, wind_name="background", width=width)
            if selection is None:
                # image_crop returns None when the user quits without
                # confirming a crop box, so there is nothing to track.
                return
            back, crop_box = selection
            cv2.imwrite("frame_background.jpg", back)
            with open("crop_box.txt", "w") as f:
                for line in crop_box:
                    f.write(str(line))
                    f.write("\n")
        else:
            back = cv2.imread("frame_background.jpg")
            if back is None:
                raise IOError("Cannot read frame_background.jpg")
            with open("crop_box.txt", "r") as f:
                crop_box = [int(line) for line in f]

        # image_crop stores the box as [row_min, row_max, col_min, col_max].
        row0, row1, col0, col1 = crop_box

        utils.initializeTrackbars()
        ocrVal = False
        # main loop
        while True:
            equations = []
            gotNewEquation = False

            # Re-arm OCR once the trackbar has been switched back to 0.
            if utils.valTrackbars()[-1] == 0:
                ocrVal = False

            grabbed, frame = cap.read()
            if not grabbed:
                # No frame was delivered; stop instead of crashing on None.
                break
            frame = imutils.resize(frame, width=width)
            frame = frame[row0:row1, col0:col1]

            frameBW, equations_imgs, bboxes = find_equations(frame, back)

            frameBW_BGR = cv2.cvtColor(frameBW, cv2.COLOR_GRAY2BGR)

            # Cycle through the palette so that more boxes than colours
            # cannot raise an IndexError.
            for idx, (x, y, w, h) in enumerate(bboxes):
                box_color = cv2_color[idx % len(cv2_color)]
                cv2.rectangle(frameBW_BGR, (x, y), (x + w, y + h), box_color, 4)
            cv2.imshow("pic", frameBW_BGR)

            if cv2.waitKey(1) & 0xFF == 27:
                break  # esc to quit

            # Run OCR only once per 0 -> 1 switch of the "OCR" trackbar.
            if not ocrVal and utils.valTrackbars()[-1] == 1:
                ocrVal = True

                for eq in equations_imgs:
                    cv2.imwrite("eq.png", eq)
                    hme_recognizer.load_image("eq.png", data_type="path")

                    try:
                        proc_img = frameBW_BGR.copy()
                        add_text(proc_img, "Detecting")
                        cv2.imshow("pic", proc_img)
                        print("Detecting")
                        expression, _ = hme_recognizer.recognize()
                        print("Latex: ", expression)
                        if "=" in expression:
                            equations.append(expression)
                            gotNewEquation = True
                    except Exception as e:
                        # Recognition is best effort: report the failure
                        # and continue with the next equation image.
                        print("Recognition failed:", e)

                if gotNewEquation:
                    try:
                        print(equations)
                        ax, fig = plot_eq(equations, ax, fig)
                        plt.pause(0.0001)
                    except Exception as e:
                        print(e)

            sleep(0.1)
    finally:
        # Always free the camera and windows, also on errors or early exit.
        cap.release()
        cv2.destroyAllWindows()


if __name__ == "__main__":
    # Script entry point: use camera index 1, forward width=None to the
    # frame resize, and reuse the background / crop box stored on disk.
    hmp(cam=1, width=None, new_back=False)

    
================================================
FILE: 2022/Math_plotter/mathplotter/__init__.py
================================================


================================================
FILE: 2022/Math_plotter/mathplotter/click_and_crop.py
================================================
# import the necessary packages
import argparse
import cv2
import imutils

# Module-level selection state shared between the mouse callback
# (click_and_crop) and the interactive loop (image_crop).
# refPt holds the [x, y] points recorded on mouse press and release;
# cropping is True while the left button is held down.
refPt = []
cropping = False
# Latest [x, y] mouse position recorded while dragging.
sel_rect_endpoint = []
# Image the callback draws on; image_crop replaces it with the camera frame.
image = []
# Lclick is set on left-button press, Rclick on left-button release.
Lclick = False
Rclick = False


def add_text(img, text):
    """Draw ``text`` onto ``img`` at a fixed position and return the result.

    The text is rendered with ``cv2.putText`` at (15, 105) in the triplex
    Hershey font, scale 0.7, colour (130, 3, 3), thickness 1, anti-aliased.
    """
    origin = (15, 105)
    return cv2.putText(
        img, text, origin, cv2.FONT_HERSHEY_TRIPLEX, 0.7, (130, 3, 3), 1, cv2.LINE_AA
    )


def click_and_crop(event, x, y, flags, param):
    """Mouse callback that records a drag rectangle in the module globals.

    ``param`` is the name of the window the callback is attached to.  A
    left-button press starts a selection, mouse movement while pressed
    updates the moving corner, and the release stores the second corner
    and draws the final rectangle onto the global ``image``.
    """
    global refPt, cropping, sel_rect_endpoint, image, Lclick, Rclick
    wind_name = param

    # Press: remember the first corner and enter cropping mode.
    if event == cv2.EVENT_LBUTTONDOWN:
        refPt, Lclick, cropping = [[x, y]], True, True
        return

    # Drag: track the current corner for the preview rectangle.
    if event == cv2.EVENT_MOUSEMOVE and cropping:
        sel_rect_endpoint = [[x, y]]
        return

    # Release: store the second corner and draw the selected region.
    if event == cv2.EVENT_LBUTTONUP:
        refPt.append([x, y])
        cropping = False
        Rclick = True
        cv2.rectangle(image, refPt[0], refPt[1], (0, 255, 0), 2)
        cv2.imshow(wind_name, image)


def image_crop(cam=0, wind_name="image", width=500):
    """Capture a still from the camera and let the user crop it with the mouse.

    A live preview is shown until "c" is pressed (capture) or Esc (abort).
    The user then drags a box on the captured still; "c" confirms it, "r"
    resets it and Esc aborts.

    Args:
        cam: Camera index handed to ``cv2.VideoCapture``.
        wind_name: Name of the OpenCV window used for the interaction.
        width: Target width handed to ``imutils.resize`` for the preview.

    Returns:
        ``(roi, crop_box)`` where ``crop_box`` is
        ``[row_min, row_max, col_min, col_max]`` in the resized image and
        ``roi`` is that region of the captured still; ``None`` if the user
        aborted or confirmed without a complete box.

    Raises:
        IOError: If no frame can be read from the camera.
    """
    global refPt, cropping, sel_rect_endpoint, image, Lclick, Rclick

    # The selection state lives in module globals shared with the mouse
    # callback; clear it so an earlier call cannot leak into this one.
    refPt = []
    cropping = False
    sel_rect_endpoint = []
    Lclick = False
    Rclick = False

    captured = False
    cap = cv2.VideoCapture(cam)
    try:
        while True:
            grabbed, frame = cap.read()
            if not grabbed:
                raise IOError("Cannot read frame from webcam")
            image = imutils.resize(frame, width=width)
            image_text = add_text(image.copy(), '"c": Capture, "Esc": Quit')
            cv2.imshow(wind_name, image_text)
            key = cv2.waitKey(1) & 0xFF

            if key == ord("c"):
                captured = True
                break
            elif key == 27:
                break  # esc to quit
    finally:
        # The camera is only needed for the preview; always release it.
        cap.release()

    if not captured:
        return None

    clone = image.copy()
    cv2.namedWindow(wind_name)
    cv2.setMouseCallback(wind_name, click_and_crop, (wind_name))

    # keep looping until the box is confirmed with "c" or Esc is pressed
    while True:
        if not cropping and not Rclick and not Lclick:
            # nothing selected yet
            prompt = add_text(image.copy(), 'Draw a Box with Mouse, or "Esc": Quit')
            cv2.imshow(wind_name, prompt)

        elif cropping and sel_rect_endpoint:
            # preview of the box while the mouse is being dragged
            rect_cpy = image.copy()
            cv2.rectangle(rect_cpy, refPt[0], sel_rect_endpoint[0], (0, 255, 0), 1)
            cv2.imshow(wind_name, rect_cpy)

        elif Rclick and Lclick:
            # a box has been drawn (the callback painted it onto `image`)
            prompt = add_text(image.copy(), '"c": Crop, "r": Reset Box, "Esc": Quit')
            cv2.imshow(wind_name, prompt)

        # Poll the keyboard once per iteration so no key press is dropped.
        key = cv2.waitKey(1) & 0xFF

        if key == ord("r"):
            # Reset: restore the clean still and clear the whole selection
            # state, including the click flags, so the draw prompt returns.
            image = clone.copy()
            cv2.imshow(wind_name, image)
            refPt = []
            cropping = False
            sel_rect_endpoint = []
            Lclick = False
            Rclick = False

        elif key == ord("c"):
            break

        elif key == 27:
            return None

    # Without two reference points there is no box to crop.
    if len(refPt) != 2:
        return None

    (xa, ya), (xb, yb) = refPt
    # Clamp at 0: a negative index would wrap around when slicing.
    crop_box = [
        max(0, min(ya, yb)),
        max(0, max(ya, yb)),
        max(0, min(xa, xb)),
        max(0, max(xa, xb)),
    ]
    roi = clone[crop_box[0] : crop_box[1], crop_box[2] : crop_box[3]]

    cv2.destroyWindow(wind_name)
    return roi, crop_box


if __name__ == "__main__":
    # Manual check: crop a background, then apply the same box to a new frame.
    cam = 0
    result = image_crop(cam=cam, wind_name="background")
    if result is None:
        # The user quit without selecting a box; nothing to show.
        raise SystemExit(0)
    img_cropped, crop_box = result
    cv2.imshow("background", img_cropped)
    cv2.waitKey(0)
    # cv2.imwrite("background.jpg", img_cropped)
    cap = cv2.VideoCapture(cam)
    grabbed, frame = cap.read()
    cap.release()
    if not grabbed:
        raise IOError("Cannot read frame from webcam")
    # crop_box refers to the image resized by image_crop (default width=500),
    # so the fresh frame must be resized the same way before slicing.
    frame = imutils.resize(frame, width=500)
    x0, x1, y0, y1 = crop_box
    frame_cropped = frame[x0:x1, y0:y1]
    cv2.imshow("frame", frame_cropped)
    cv2.waitKey(0)
    # cv2.imwrite("frame.jpg", frame_cropped)


================================================
FILE: 2022/Math_plotter/mathplotter/latexPlotter.py
================================================
import numpy as np
from mpl_toolkits.mplot3d import Axes3D
import matplotlib

import matplotlib.pyplot as plt
from matplotlib import cm
import re
from time import sleep
from sympy import numer
import sympy
from sympy.parsing.latex import parse_latex
from sympy.plotting import plot, plot3d

# plt.rcParams.update({"text.usetex": True, "xtick.labelsize": 16, "ytick.labelsize": 16})
# Line colours for the 2D curves; plot_eq picks one by equation index.
plt_color = ["#0025b8", "#820303", "#02630f", "#460263", "#018c75"]


def plot_eq(equations, ax, fig):
    """Plot LaTeX equations given as ``y=...`` (2D) or ``z=...`` (3D).

    The axes are cleared first.  The projection (2D or 3D) is chosen by
    ``check_axis`` from the first equation only.  For every equation the
    dependent variable and the ``=`` sign are stripped, the rest is parsed
    with ``parse_latex`` and sampled through sympy's plotting module, and
    the sampled data is drawn on ``ax``.

    Args:
        equations: Non-empty list of LaTeX strings.
        ax: Matplotlib axes to draw on; replaced when the projection changes.
        fig: Figure that owns ``ax``.

    Returns:
        The ``(ax, fig)`` pair, where ``ax`` may be a newly created axes.
    """
    ax.clear()

    ax, fig = check_axis(equations[0], ax, fig)

    for idx, eq in enumerate(equations):

        eq = eq.replace("\\cdot", "*")
        # An equation mentioning "z" is treated as a surface z = f(x, y).
        dim = 3 if "z" in eq else 2
        # Keep only the expression: drop the dependent variable and "=".
        eq = eq.replace("z" if dim == 3 else "y", "")
        eq = eq.replace("=", "")

        # plot equations
        sympy_eq = parse_latex(eq)

        if dim == 2:
            p = plot(sympy_eq, show=False)
            eq_latex = r"$y=" + sympy.latex(sympy_eq) + "$"
            x, y = p[0].get_data()
            ax.plot(
                x,
                y,
                label=eq_latex,
                # Cycle through the palette so that more equations than
                # colours cannot raise an IndexError.
                color=plt_color[idx % len(plt_color)],
                linewidth=2,
            )
        else:
            p = plot3d(sympy_eq, show=False)

            x, y, z = p[0].get_meshes()
            eq_latex = r"$z=" + sympy.latex(sympy_eq) + "$"
            surf = ax.plot_surface(
                x,
                y,
                z,
                label=eq_latex,
                cmap=cm.coolwarm,
                linewidth=0,
                antialiased=False,
            )
            # NOTE(review): presumably needed so ax.legend() can handle the
            # 3D surface; relies on private matplotlib attributes - verify
            # against the installed matplotlib version.
            surf._facecolors2d = surf._facecolor3d
            surf._edgecolors2d = surf._edgecolor3d

    ax.legend(fontsize=12)
    plt.pause(0.0001)
    plt.show(block=False)
    plt.pause(0.0001)

    return ax, fig


def check_axis(eq, ax, fig):
    """Make sure ``ax`` has the projection that ``eq`` needs.

    An equation containing ``"z"`` needs 3D axes, any other equation 2D
    axes.  When the current axes do not match, they are removed from the
    figure and a new subplot with the right projection is added.

    Returns:
        The ``(ax, fig)`` pair, where ``ax`` may be a newly created axes.
    """
    needs_3d = "z" in eq
    has_3d = ax.name == "3d"

    # Nothing to do when the current projection already fits.
    if needs_3d == has_3d:
        return ax, fig

    ax.remove()
    plt.pause(0.0001)
    ax = fig.add_subplot(projection="3d") if needs_3d else fig.add_subplot()
    plt.pause(0.0001)

    return ax, fig


if __name__ == "__main__":
    # Demo: plot two 2D curves, then switch the same figure to a 3D surface.
    plt.ion()
    fig = plt.figure(figsize=(8, 5), tight_layout=True)
    ax = fig.gca()
    plt.pause(0.0001)

    equations = ["y=\\sqrt{\\sqrt{x^2}}+\\sqrt{1-x^2}", "y=\\sqrt{\\sqrt{x^2}}-\\sqrt{1-x^2}"]
    ax, fig = plot_eq(equations, ax, fig)
    sleep(2)
    # The "z" in this equation makes plot_eq switch to 3D axes.
    equations = [r"z=x^2-y^2"]
    ax, fig = plot_eq(equations, ax, fig)
    sleep(2)

    # equations = ["z=x+y"]
    # ax, fig = plot_eq(equations, ax, fig)
    # sleep(2)

    # equations = ["y=x^2", "y=2*x"]
    # ax, fig = plot_eq(equations, ax, fig)
    # sleep(2)


================================================
FILE: 2022/Math_plotter/mathplotter/readEquations.py
================================================
import cv2
import numpy as np
from PIL import Image
import mathplotter.utils as utils


# Palettes from utils.plotColors(): OpenCV colour tuples first, matplotlib
# hex strings second.
colors = utils.plotColors()
cv2_color, plt_color = colors


def to_bw(img):
    """Convert a colour image to a dilated black-and-white image.

    The image is converted to greyscale and binarised with Otsu's method.
    If the threshold Otsu picked is below the "Cut noise" slider value, a
    fixed threshold of 100 is used instead.  The binary image is then
    dilated with a square kernel whose side is the "Text width" slider.

    Args:
        img: Colour image in OpenCV's BGR channel order.

    Returns:
        The dilated binary image.
    """
    # The images reaching this function come from cv2.VideoCapture and
    # cv2.imread, which deliver BGR, so convert from BGR rather than RGB.
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # gray_blur = cv2.GaussianBlur(gray, (21, 21), 0)
    otsu_thresh, img_bw = cv2.threshold(
        gray, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU
    )

    # Read the sliders once instead of polling the GUI twice per call.
    trackbars = utils.valTrackbars()
    kernel_size, cut_noise = trackbars[0], trackbars[3]

    if otsu_thresh < cut_noise:
        _, img_bw = cv2.threshold(gray, 100, 255, cv2.THRESH_BINARY)

    kernel_dilate = np.ones((kernel_size, kernel_size))
    return cv2.dilate(img_bw, kernel_dilate, iterations=1)


def find_equations(img, back_img):
    """Locate the regions of ``img`` that differ from ``back_img``.

    The frame is subtracted from the background, binarised with ``to_bw``
    and dilated with a rectangular kernel sized by the "Box width" /
    "Box hight" sliders.  Every external contour of the result gives one
    bounding box.

    Returns:
        A tuple ``(img_clean, equations, box_coords)``: the inverted binary
        image, one crop of it per bounding box, and the boxes as
        ``[x, y, w, h]`` lists.
    """
    foreground = cv2.subtract(back_img, img)
    binary = to_bw(foreground)
    inverted = cv2.bitwise_not(binary)

    # A slider value of 0 is replaced by 1 for the kernel size.
    kernel_w, kernel_h = (value or 1 for value in utils.valTrackbars()[1:3])
    rect_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (kernel_w, kernel_h))
    merged = cv2.morphologyEx(binary, cv2.MORPH_DILATE, rect_kernel)

    # ---Finding contours ---
    contours, _ = cv2.findContours(merged, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)

    box_coords = [list(cv2.boundingRect(contour)) for contour in contours]
    equations = [inverted[y : y + h, x : x + w] for x, y, w, h in box_coords]

    return inverted, equations, box_coords


def frame_change(new_frame, old_frame):
    """Report whether ``cv2.subtract(old_frame, new_frame)`` is non-zero.

    A three-dimensional (colour) difference is first reduced with
    ``to_bw``; the result is then tested through its norm.
    """
    diff = cv2.subtract(old_frame, new_frame)
    # colour images carry a third (channel) axis
    if diff.ndim == 3:
        diff = to_bw(diff)

    return np.linalg.norm(diff) != 0


if __name__ == "__main__":
    # Manual check on two still images instead of a live camera feed.
    from click_and_crop import image_crop
    from time import sleep

    utils.initializeTrackbars()

    back = cv2.imread("background1.jpg")
    frame = cv2.imread("frame1.jpg")
    # Re-run the detection every iteration so slider changes show up live.
    while True:

        if cv2.waitKey(1) & 0xFF == 27:
            break  # esc to quit

        frameBW, equations, bboxes = find_equations(frame, back)
        frameBW_BGR = cv2.cvtColor(
            frameBW,
            cv2.COLOR_GRAY2BGR,
        )

        # NOTE(review): nothing is shown when bboxes is empty (there is no
        # else branch), and the bare except also hides an IndexError from
        # cv2_color[idx] when there are more boxes than colours.
        try:
            if bboxes:
                for idx, bbox in enumerate(bboxes):
                    x, y, w, h = bbox

                    cv2.rectangle(
                        frameBW_BGR, (x, y), (x + w, y + h), cv2_color[idx], 4
                    )

                    cv2.imshow("pic", frameBW_BGR)
        except:
            cv2.imshow("pic", frameBW_BGR)


================================================
FILE: 2022/Math_plotter/mathplotter/utils.py
================================================
# import opencv and numpy
import cv2
import numpy as np


def plotColors():
    """Return the two plot palettes as ``[cv2_color, plt_color]``.

    ``cv2_color`` is a list of 3-tuples of ints and ``plt_color`` a list of
    hex colour strings; both lists have seven entries.
    """
    # Each row pairs the i-th OpenCV colour with the i-th matplotlib colour.
    palette = (
        ((130, 3, 3), "#0025b8"),
        ((0, 37, 184), "#820303"),
        ((2, 99, 15), "#02630f"),
        ((70, 2, 99), "#460263"),
        ((1, 140, 117), "#018c75"),
        ((138, 145, 1), "#8a9101"),
        ((1, 120, 106), "#01786a"),
    )
    cv2_color = [triple for triple, _ in palette]
    plt_color = [hex_code for _, hex_code in palette]
    return [cv2_color, plt_color]


# The trackbars need a callback function; this one deliberately does nothing.
def nothing(x):
    """Ignore the value ``x`` and return ``None``."""
    return None


def initializeTrackbars(initVals=(1, 150, 50, 2)):
    """Create the "trackbars" window with the five control sliders.

    Args:
        initVals: Initial slider positions, indexed as
            ``[text width, box width, box height, cut noise]``.  The default
            is a tuple so that no mutable object is shared between calls.
    """
    # create a separate window for the trackbars
    cv2.namedWindow("trackbars")
    # create trackbars; the names are the keys that valTrackbars reads back
    cv2.createTrackbar("Cut noise", "trackbars", initVals[3], 20, nothing)
    cv2.createTrackbar("Text width", "trackbars", initVals[0], 10, nothing)
    cv2.createTrackbar("Box width", "trackbars", initVals[1], 300, nothing)
    cv2.createTrackbar("Box hight", "trackbars", initVals[2], 300, nothing)
    # on/off switch, starts at 0
    cv2.createTrackbar("OCR", "trackbars", 0, 1, nothing)


def valTrackbars():
    """Read the current slider positions from the "trackbars" window.

    Returns:
        ``[textWidth, boxWidth, boxHeight, cutNoise, ocr]`` as a list.
    """
    slider_names = ("Text width", "Box width", "Box hight", "Cut noise", "OCR")
    return [cv2.getTrackbarPos(name, "trackbars") for name in slider_names]


================================================
FILE: 2022/Sentiment_Analysis/tw-sentiment.py
================================================
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.special import softmax

# tweet = "@MehranShakarami today's cold @ home 😒 https://mehranshakarami.com"
tweet = 'Great content! subscribed 😉'

# preprocess tweet: replace user mentions by '@user' and links by 'http'
tweet_words = []

for word in tweet.split(' '):
    if word.startswith('@') and len(word) > 1:
        word = '@user'
    elif word.startswith('http'):
        word = "http"
    tweet_words.append(word)

tweet_proc = " ".join(tweet_words)

# load model and tokenizer
roberta = "cardiffnlp/twitter-roberta-base-sentiment"

model = AutoModelForSequenceClassification.from_pretrained(roberta)
tokenizer = AutoTokenizer.from_pretrained(roberta)

# label names, in the order of the model's output scores
labels = ['Negative', 'Neutral', 'Positive']

# sentiment analysis
encoded_tweet = tokenizer(tweet_proc, return_tensors='pt')
# output = model(encoded_tweet['input_ids'], encoded_tweet['attention_mask'])
output = model(**encoded_tweet)

# first element of the output, first (only) tweet -> raw scores
scores = output[0][0].detach().numpy()
scores = softmax(scores)

# print every label next to its score
for label, score in zip(labels, scores):
    print(label, score)


================================================
FILE: 2022/Twitter_API/config.ini
================================================
[twitter]

api_key = 
api_key_secret = 

access_token = 
access_token_secret = 


================================================
FILE: 2022/Twitter_API/twitter_data_search.py
================================================
import tweepy
import configparser
import pandas as pd

# Load the Twitter credentials from the [twitter] section of config.ini
config = configparser.ConfigParser()
config.read('config.ini')
credentials = config['twitter']

api_key = credentials['api_key']
api_key_secret = credentials['api_key_secret']
access_token = credentials['access_token']
access_token_secret = credentials['access_token_secret']

# Build an authenticated API client
auth = tweepy.OAuthHandler(api_key, api_key_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)

# Alternative: tweets of a single user
# user = 'veritasium'
# limit=300
# tweets = tweepy.Cursor(api.user_timeline, screen_name=user, count=200, tweet_mode='extended').items(limit)
# tweets = api.user_timeline(screen_name=user, count=limit, tweet_mode='extended')

# Search tweets by keyword; the cursor yields at most `limit` items
keywords = '@veritasium'
limit = 300

search_cursor = tweepy.Cursor(api.search_tweets, q=keywords, count=100, tweet_mode='extended')
tweets = search_cursor.items(limit)

# One row per tweet: author screen name and full text
columns = ['User', 'Tweet']
data = [[tweet.user.screen_name, tweet.full_text] for tweet in tweets]

df = pd.DataFrame(data, columns=columns)

print(df)


================================================
FILE: 2022/Twitter_API/twitter_data_stream.py
================================================
import tweepy
import configparser
import pandas as pd


# Load the Twitter credentials from the [twitter] section of config.ini
config = configparser.ConfigParser()
config.read('config.ini')
credentials = config['twitter']

api_key = credentials['api_key']
api_key_secret = credentials['api_key_secret']
access_token = credentials['access_token']
access_token_secret = credentials['access_token_secret']

# Build an authenticated API client
auth = tweepy.OAuthHandler(api_key, api_key_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)


class Linstener(tweepy.Stream):
    """Stream listener that collects statuses and stops at ``limit``.

    Collected statuses are available as ``self.tweets`` after the stream
    has disconnected.
    """

    # Number of statuses to collect before disconnecting.
    limit = 1

    def __init__(self, *args, **kwargs):
        """Forward all arguments to ``tweepy.Stream`` and start empty."""
        super().__init__(*args, **kwargs)
        # Per-instance buffer: a class-level list would be shared by every
        # listener and keep growing from one stream to the next.
        self.tweets = []

    def on_status(self, status):
        """Store ``status`` and disconnect once ``limit`` is reached."""
        self.tweets.append(status)
        # print(status.user.screen_name + ": " + status.text)

        # ">=" so the stream still stops if the count ever passes the limit.
        if len(self.tweets) >= self.limit:
            self.disconnect()


stream_tweet = Linstener(api_key, api_key_secret, access_token, access_token_secret)

# Alternative: stream by keywords
# keywords = ['2022', '#python']
# stream_tweet.filter(track=keywords)

# Stream by users: look up the id of every screen name
users = ['MehranShakarami', 'veritasium']
user_ids = [api.get_user(screen_name=user).id for user in users]

stream_tweet.filter(follow=user_ids)

# One row per collected tweet: author screen name and text; truncated
# tweets keep their complete text under extended_tweet['full_text']
columns = ['User', 'Tweet']
data = [
    [
        tweet.user.screen_name,
        tweet.extended_tweet['full_text'] if tweet.truncated else tweet.text,
    ]
    for tweet in stream_tweet.tweets
]

df = pd.DataFrame(data, columns=columns)

print(df)

================================================
FILE: 2022/Twitter_API/twitter_data_users.py
================================================
import tweepy
import configparser
import pandas as pd

# Load the Twitter credentials from the [twitter] section of config.ini
config = configparser.ConfigParser()
config.read('config.ini')
credentials = config['twitter']

api_key = credentials['api_key']
api_key_secret = credentials['api_key_secret']
access_token = credentials['access_token']
access_token_secret = credentials['access_token_secret']

# Build an authenticated API client
auth = tweepy.OAuthHandler(api_key, api_key_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)

# Tweets of a single user; the cursor yields at most `limit` items
user = 'veritasium'
limit = 300

timeline_cursor = tweepy.Cursor(api.user_timeline, screen_name=user, count=200, tweet_mode='extended')
tweets = timeline_cursor.items(limit)

# tweets = api.user_timeline(screen_name=user, count=limit, tweet_mode='extended')

# One row per tweet: author screen name and full text
columns = ['User', 'Tweet']
data = [[tweet.user.screen_name, tweet.full_text] for tweet in tweets]

df = pd.DataFrame(data, columns=columns)

print(df)


================================================
FILE: 2022/Web_Scraping/bs-amazon.py
================================================
from bs4 import BeautifulSoup
import requests
import csv


# get html
url = "https://www.amazon.com/Best-Sellers-Books/zgbs/books"

# change the user-agent value based on your web browser
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.82 Safari/537.36'}

page = requests.get(url, headers=headers)

soup = BeautifulSoup(page.content, 'html.parser')

# get all books
books = soup.find_all(id="gridItemRoot")

csv_headers = ['Rank', 'Title', 'Author', 'Price']

# Open the output file once and reuse the same writer for the header and
# every row, instead of reopening the file for each book.
with open('amazon_books.csv', 'w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(csv_headers)

    for book in books:
        rank_tag = book.find('span', class_='zg-bdg-text')
        faceout = book.find('div', class_='zg-grid-general-faceout')

        # find() returns None for a missing element; skip entries that do
        # not have the expected layout instead of crashing on them.
        if rank_tag is None or faceout is None or faceout.div is None:
            continue

        children = faceout.div
        if len(children.contents) < 3:
            continue

        # the rank text has one leading character that is dropped
        rank = rank_tag.text[1:]
        title = children.contents[1].text
        author = children.contents[2].text
        price = children.contents[-1].text

        writer.writerow([rank, title, author, price])


================================================
FILE: 2022/snscrape/tweets.py
================================================
import snscrape.modules.twitter as sntwitter
import pandas as pd

query = "(from:elonmusk) until:2020-01-01 since:2010-01-01"
tweets = []
limit = 5000

scraper = sntwitter.TwitterSearchScraper(query)

# `fetched` counts the tweets stored so far; stop once `limit` is reached
for fetched, tweet in enumerate(scraper.get_items()):
    # print(vars(tweet))
    # break
    if fetched == limit:
        break
    tweets.append([tweet.date, tweet.username, tweet.content])

df = pd.DataFrame(tweets, columns=['Date', 'User', 'Tweet'])
print(df)

# to save to csv
# df.to_csv('tweets.csv')

================================================
FILE: 2024/Multi-lingual sentiment analysis/main.py
================================================
from sentiment import predict_sentiment, ROBERTA_SUPPORTED_LANGUAGES
from translate import translate_text
import csv

def read_tweets(file_path: str) -> list[dict[str, str]]:
    """Read a CSV file with a header row into a list of row dictionaries.

    Args:
        file_path: Path of the CSV file.

    Returns:
        One dictionary per data row, keyed by the column names of the
        header row.
    """
    # The data contains non-ASCII text, so decode explicitly as UTF-8
    # instead of relying on the platform default; newline="" is the
    # setting the csv module expects for files it reads.
    with open(file_path, "r", encoding="utf-8", newline="") as file:
        return list(csv.DictReader(file))

tweets = read_tweets("./tweets.csv")

sentiment_by_id = {}

for tweet in tweets:
    tweet_text = tweet["text"]
    language = tweet["language"]

    # No usable language tag: translate, which also reports the language
    if not language or language not in ROBERTA_SUPPORTED_LANGUAGES:
        translated_text, language = translate_text(tweet_text)

    # Supported languages are classified on the original text, all others
    # on the translation
    model_input = tweet_text if language in ROBERTA_SUPPORTED_LANGUAGES else translated_text
    sentiment_by_id[tweet["id"]] = predict_sentiment(model_input)


#/ check the accuracy
test_labels = read_tweets("./test_labels.csv")
correct_predictions = sum(
    1 for test in test_labels if sentiment_by_id[test["id"]] == test["label"]
)

accuracy = correct_predictions / len(test_labels)
print(f"Accuracy: {accuracy:.2f}")

================================================
FILE: 2024/Multi-lingual sentiment analysis/readme.md
================================================
# Multi-lingual tweet sentiment analysis

This code uses [twitter-XLM-roBERTa-base](https://huggingface.co/cardiffnlp/twitter-xlm-roberta-base-sentiment)
and Google Translate to perform sentiment analysis on multi-lingual tweets.

Check out the supplementary YouTube tutorial: https://youtu.be/t_A_35m9OzU

================================================
FILE: 2024/Multi-lingual sentiment analysis/requirements.txt
================================================
googletrans==3.1.0a0
transformers==4.44.2
torch==2.4.0
sentencepiece==0.2.0
protobuf==5.28.0

================================================
FILE: 2024/Multi-lingual sentiment analysis/sentiment.py
================================================
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig


# Name of the pretrained sentiment model, used for every from_pretrained call
MODEL = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
# Language codes whose tweets main.py classifies without translating them
ROBERTA_SUPPORTED_LANGUAGES = ('ar', 'en', 'fr', 'de', 'hi', 'it', 'es', 'pt')

# Loaded once at import time and shared by predict_sentiment
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)

#/ save the model locally
# NOTE(review): MODEL is used as the target path, so this runs on every
# import; presumably later loads then come from that local directory - confirm.
model.save_pretrained(MODEL)
tokenizer.save_pretrained(MODEL)


# Preprocess text (username and link placeholders)
def preprocess(text):
    """Replace user mentions by ``@user`` and links by ``http``.

    The text is split on single spaces.  A token that starts with ``@`` and
    is longer than one character becomes ``@user``; a token that starts
    with ``http`` becomes ``http``; every other token is kept as it is.
    """
    tokens = []
    for token in text.split(" "):
        if token.startswith("http"):
            token = "http"
        elif len(token) > 1 and token[0] == "@":
            token = "@user"
        tokens.append(token)
    return " ".join(tokens)

def predict_sentiment(text: str) -> str:
    """Return the sentiment label the model assigns to ``text``.

    The text is preprocessed, tokenised and passed to the model; the index
    of the largest logit is mapped to its label through ``config.id2label``.
    """
    model_inputs = tokenizer(preprocess(text), return_tensors='pt')
    logits = model(**model_inputs).logits
    return config.id2label[logits.argmax().item()]


# text = "la pizza da @michele è veramente buona https://www.youtube.com"
# text = "این غذا خیلی شوره!"
# text = "یه جلسه دیگه که میتونست یه ایمیل باشه 🥲"
# print(predict_sentiment(text))

================================================
FILE: 2024/Multi-lingual sentiment analysis/test_labels.csv
================================================
id,label
0,positive
1,negative
2,positive
3,neutral
4,negative
5,neutral
6,positive
7,neutral
8,positive
9,negative
10,neutral
11,positive
12,negative
13,negative

================================================
FILE: 2024/Multi-lingual sentiment analysis/translate.py
================================================
from googletrans import Translator

translator = Translator()


def translate_text(original_text: str) -> tuple[str, str]:
    """Translate ``original_text`` to English.

    Args:
        original_text: Text in any source language.

    Returns:
        A ``(translated_text, original_language)`` pair: the English text
        and the source language reported by the translator
        (``translation.src``).
    """
    translation = translator.translate(original_text, dest='en')
    return translation.text, translation.src

# original_text = "این غذا خیلی شوره!"
# print(translate_text(original_text))

================================================
FILE: 2024/Multi-lingual sentiment analysis/tweets.csv
================================================
id,text,language
0,باب الحارة يغدر بيك غدر، يخليك اتفرج علية من الاول بدون ما تفطن😂,
1,مشاكل البلاك بورد متى تنتهي💔؟,
2,@FCAugsburg Der neue Styler der Bilder ist echt genial! Gefällt mir!,de
3,Kein Pardon: UEFA-Ermittlungen gegen Manchester United! #MUFCFCV #MUFC #Manchester https://t.co/PwZFpoqN1p https://t.co/PYfNgyneSu,de
4,"Ach man, ich hasse Gegentore. #S04",de
5,is listening to Jello Biafra at work.,en
6,"@_chloe yes! it`s on youtube  its from may 7th, and it made me feel 100x better. it`s halarious.",en
7,اینا اپلیکیشن فرم ندارن براشون اپلای کنیم؟,fa
8,این ترم هم معدل الف شدم هو هو🤭🤭,fa
9,خب حس میکنم پیر شدم و تا اطلاع ثانوی همه چی کنسله کنسل,fa
10,सर्व धर्म मंदिर के अलावा यहां पर एक संग्रहालय और चिड़ियाघर भी है।,hi
11,"इस पर ब्राउज़िंग करना, गेम्स खेलना और मूवी देखना काफी अच्छा अनुभव है।",hi
12,कम से कम HD स्क्रीन तो देनी चाहिए थी।,hi
13,Great! Another meeting that could be an email.,en

================================================
FILE: 2024/Twikit/config.ini
================================================
[X]
username = xxx
password = xxxxxx
email = xxxx@xxxx.com

================================================
FILE: 2024/Twikit/main.py
================================================
from twikit import Client, TooManyRequests
import time
from datetime import datetime
import csv
from configparser import ConfigParser
from random import randint


# The main loop keeps requesting tweets until at least this many are stored
MINIMUM_TWEETS = 10
# Search query passed to client.search_tweet
QUERY = '(from:elonmusk) lang:en until:2020-01-01 since:2018-01-01'


def get_tweets(tweets):
    """Return the first page of search results, or the page after ``tweets``.

    With ``tweets`` set to ``None`` a new search for ``QUERY`` is started.
    Otherwise the function sleeps for a random 5-10 seconds and then
    returns ``tweets.next()``.
    """
    if tweets is not None:
        wait_time = randint(5, 10)
        print(f'{datetime.now()} - Getting next tweets after {wait_time} seconds ...')
        time.sleep(wait_time)
        return tweets.next()

    #* get tweets
    print(f'{datetime.now()} - Getting tweets...')
    return client.search_tweet(QUERY, product='Top')


#* login credentials
config = ConfigParser()
config.read('config.ini')
username = config['X']['username']
email = config['X']['email']
password = config['X']['password']

#* create a csv file
#* UTF-8 so that tweet text is written the same way on every platform
with open('tweets.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['Tweet_count', 'Username', 'Text', 'Created At', 'Retweets', 'Likes'])


#* authenticate to X.com
#! 1) use the login credentials. 2) use cookies.
client = Client(language='en-US')
# client.login(auth_info_1=username, auth_info_2=email, password=password)
# client.save_cookies('cookies.json')

client.load_cookies('cookies.json')

tweet_count = 0
tweets = None

while tweet_count < MINIMUM_TWEETS:

    try:
        tweets = get_tweets(tweets)
    except TooManyRequests as e:
        rate_limit_reset = datetime.fromtimestamp(e.rate_limit_reset)
        print(f'{datetime.now()} - Rate limit reached. Waiting until {rate_limit_reset}')
        wait_time = rate_limit_reset - datetime.now()
        #* the reset time may already have passed; time.sleep rejects
        #* negative values, so never wait less than zero seconds
        time.sleep(max(wait_time.total_seconds(), 0))
        continue

    if not tweets:
        print(f'{datetime.now()} - No more tweets found')
        break

    #* append the whole page with one file handle instead of one per tweet
    with open('tweets.csv', 'a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        for tweet in tweets:
            tweet_count += 1
            tweet_data = [tweet_count, tweet.user.name, tweet.text, tweet.created_at, tweet.retweet_count, tweet.favorite_count]
            writer.writerow(tweet_data)

    print(f'{datetime.now()} - Got {tweet_count} tweets')


print(f'{datetime.now()} - Done! Got {tweet_count} tweets found')

================================================
FILE: 2024/Twikit/readme.md
================================================
# Scraping `X.com` with Twikit

This code uses [Twikit](https://github.com/d60/twikit) to scrape Tweet data. To run the code, install the
_twikit_ package with `pip install "twikit==1.7.6"`. Later releases of _twikit_
deprecated the synchronous methods that this code relies on.

Check out the supplementary YouTube tutorial: https://youtu.be/6D6fVyFQD5A

================================================
FILE: 2024/Twikit/tweets.csv
================================================
Tweet_count,Username,Text,Created At,Retweets,Likes


================================================
FILE: README.md
================================================
# AI_Spectrum