[
  {
    "path": ".github/workflows/jobs.yml",
    "content": "on:\n  push:\n    tags:\n      - '*'\n\njobs:\n  build:\n    runs-on: ubuntu-latest\n\n    steps:\n      - name: Set up QEMU\n        uses: docker/setup-qemu-action@v3\n\n      - name: Set up Docker Buildx\n        uses: docker/setup-buildx-action@v3\n\n      - name: Login to Docker Hub\n        uses: docker/login-action@v3\n        with:\n          username: ${{ secrets.DOCKER_USERNAME }}\n          password: ${{ secrets.DOCKER_PASSWORD }}\n\n      - name: Build and push\n        uses: docker/build-push-action@v6\n        with:\n          platforms: linux/amd64,linux/arm64\n          push: true\n          tags: tonypai/summary-gpt-bot:${{ github.ref_name }}\n"
  },
  {
    "path": "Dockerfile",
    "content": "FROM debian:11-slim AS build\n\nRUN apt-get update && \\\n    apt-get install --no-install-suggests --no-install-recommends --yes python3-venv gcc libpython3-dev && \\\n    python3 -m venv /venv && \\\n    /venv/bin/pip install --upgrade pip setuptools wheel\n\nFROM build AS build-venv\n\nCOPY requirements.txt /requirements.txt\nRUN /venv/bin/pip install --disable-pip-version-check -r /requirements.txt\n\nFROM gcr.io/distroless/python3-debian11:nonroot\n\nWORKDIR /app\nCOPY --from=build-venv /venv /venv\nCOPY main.py .\n\nENV PYTHONUNBUFFERED=1\n\nENTRYPOINT [\"/venv/bin/python3\", \"-u\", \"main.py\"]\n"
  },
  {
    "path": "README.md",
    "content": "# Summary GPT Bot\n\nAn AI-powered text summarization Telegram bot that generates concise summaries of text, URLs, PDFs and YouTube videos.\n\n## Features\n\n- Supports text\n- Supports URLs\n- Supports PDFs\n- Supports YouTube videos (no support for YouTube Shorts)\n\n## Usage\n\nLaunch a OpenAI GPT-4 summary bot that only can be used by your friends and you.\n\n```sh\ndocker run -d \\\n    -e LLM_MODEL=gpt-4 \\\n    -e OPENAI_API_KEY=$OPENAI_API_KEY \\\n    -e TELEGRAM_TOKEN=$YOUR_TG_TOKEN \\\n    -e TS_LANG=$YOUR_LANGUAGE \\\n    -e ALLOWED_USERS=<friend1_id>,<friend2_id>,<your_id> \\\n    tonypai/summary-gpt-bot:latest\n```\n\nLaunch a summary bot using Azure OpenAI.\n\n```sh\ndocker run -d \\\n    -e AZURE_API_BASE=https://<your_azure_resource_name>.openai.azure.com \\\n    -e AZURE_API_KEY=$AZURE_API_KEY \\\n    -e AZURE_API_VERSION=2024-02-15-preview \\\n    -e LLM_MODEL=azure/<your_deployment_name> \\\n    -e TELEGRAM_TOKEN=$YOUR_TG_TOKEN \\\n    -e TS_LANG=$YOUR_LANGUAGE \\\n    tonypai/summary-gpt-bot:latest\n```\n\nLLM Variables\n\n| Environment Variable | Description |\n|----------------------|-------------|\n| AZURE_API_BASE       | API URL base for AZURE OpenAI API |\n| AZURE_API_KEY        | API key for AZURE OpenAI API |\n| AZURE_API_VERSION    | API version for AZURE OpenAI API |\n| OPENAI_API_KEY       | API key for OpenAI API |\n\nBot Variables\n\n| Environment Variable | Description |\n|----------------------|-------------|\n| CHUNK_SIZE           | The maximum token of a chunk when receiving a large input (default: 10000) |\n| LLM_MODEL            | LLM Model to use for text summarization (default: gpt-3.5-turbo-16k) |\n| TELEGRAM_TOKEN       | Token for Telegram API (required) |\n| TS_LANG              | Language of the text to be summarized (default: Taiwanese Mandarin) |\n| DDG_REGION           | The region of the duckduckgo search (default: wt-wt) 👉[Regions](https://github.com/deedy5/duckduckgo_search#regions) |\n| 
ALLOWED_USERS        | A comma-separated list of user IDs allowed to use the bot. Ask @myidbot for your Telegram ID (optional) |\n"
  },
  {
    "path": "main.py",
    "content": "import asyncio\nimport os\nimport re\nimport trafilatura\nfrom litellm import completion\nfrom duckduckgo_search import AsyncDDGS\nfrom PyPDF2 import PdfReader\nfrom concurrent.futures import ThreadPoolExecutor\nfrom tqdm import tqdm\nfrom telegram import InlineKeyboardButton, InlineKeyboardMarkup\nfrom telegram.ext import CommandHandler, MessageHandler, CallbackQueryHandler, filters, ApplicationBuilder\nfrom youtube_transcript_api import YouTubeTranscriptApi\n\ntelegram_token = os.environ.get(\"TELEGRAM_TOKEN\", \"xxx\")\nmodel = os.environ.get(\"LLM_MODEL\", \"gpt-3.5-turbo-16k\")\nlang = os.environ.get(\"TS_LANG\", \"Taiwanese Mandarin\")\nddg_region = os.environ.get(\"DDG_REGION\", \"wt-wt\")\nchunk_size = int(os.environ.get(\"CHUNK_SIZE\", 10000))\nallowed_users = os.environ.get(\"ALLOWED_USERS\", \"\")\n\ndef split_user_input(text):\n    # Split the input text into paragraphs\n    paragraphs = text.split('\\n')\n\n    # Remove empty paragraphs and trim whitespace\n    paragraphs = [paragraph.strip() for paragraph in paragraphs if paragraph.strip()]\n\n    return paragraphs\n\ndef scrape_text_from_url(url):\n    \"\"\"\n    Scrape the content from the URL\n    \"\"\"\n    try:\n        downloaded = trafilatura.fetch_url(url)\n        text = trafilatura.extract(downloaded, include_formatting=True)\n        if text is None:\n            return []\n        text_chunks = text.split(\"\\n\")\n        article_content = [text for text in text_chunks if text]\n        return article_content\n    except Exception as e:\n        print(f\"Error: {e}\")\n\nasync def search_results(keywords):\n    print(keywords, ddg_region)\n    results = await AsyncDDGS().text(keywords, region=ddg_region, safesearch='off', max_results=3)\n    return results\n\ndef summarize(text_array):\n    \"\"\"\n    Summarize the text using GPT API\n    \"\"\"\n\n    def create_chunks(paragraphs):\n        chunks = []\n        chunk = ''\n        for paragraph in paragraphs:\n         
   if len(chunk) + len(paragraph) < chunk_size:\n                chunk += paragraph + ' '\n            else:\n                chunks.append(chunk.strip())\n                chunk = paragraph + ' '\n        if chunk:\n            chunks.append(chunk.strip())\n        return chunks\n\n    try:\n        text_chunks = create_chunks(text_array)\n        text_chunks = [chunk for chunk in text_chunks if chunk] # Remove empty chunks\n\n        # Call the GPT API in parallel to summarize the text chunks\n        summaries = []\n        system_messages = [\n            {\"role\": \"system\", \"content\": \"You are an expert in creating summaries that capture the main points and key details.\"},\n            {\"role\": \"system\", \"content\": f\"You will show the bulleted list content without translate any technical terms.\"},\n            {\"role\": \"system\", \"content\": f\"You will print all the content in {lang}.\"},\n        ]\n        with ThreadPoolExecutor() as executor:\n            futures = [executor.submit(call_gpt_api, f\"Summary keypoints for the following text:\\n{chunk}\", system_messages) for chunk in text_chunks]\n            for future in tqdm(futures, total=len(text_chunks), desc=\"Summarizing\"):\n                summaries.append(future.result())\n\n        if len(summaries) <= 5:\n            summary = ' '.join(summaries)\n            with tqdm(total=1, desc=\"Final summarization\") as progress_bar:\n                final_summary = call_gpt_api(f\"Create a bulleted list using {lang} to show the key points of the following text:\\n{summary}\", system_messages)\n                progress_bar.update(1)\n            return final_summary\n        else:\n            return summarize(summaries)\n    except Exception as e:\n        print(f\"Error: {e}\")\n        return \"Unknown error! 
Please contact the developer.\"\n\ndef extract_youtube_transcript(youtube_url):\n    try:\n        video_id_match = re.search(r\"(?<=v=)[^&]+|(?<=youtu.be/)[^?|\\n]+\", youtube_url)\n        video_id = video_id_match.group(0) if video_id_match else None\n        if video_id is None:\n            return \"no transcript\"\n        ytt_api = YouTubeTranscriptApi()\n        transcript_list = ytt_api.list(video_id)\n        transcript = transcript_list.find_transcript(['en', 'ja', 'ko', 'de', 'fr', 'ru', 'it', 'es', 'pl', 'uk', 'nl', 'zh-TW', 'zh-CN', 'zh-Hant', 'zh-Hans'])\n        transcript_text = ' '.join([item.text for item in transcript.fetch()])\n        return transcript_text\n    except Exception as e:\n        print(f\"Error: {e}\")\n        return \"no transcript\"\n\ndef retrieve_yt_transcript_from_url(youtube_url):\n    output = extract_youtube_transcript(youtube_url)\n    if output == 'no transcript':\n        raise ValueError(\"There's no valid transcript in this video.\")\n    # Split output into an array based on the end of the sentence (like a dot),\n    # but each chunk should be smaller than chunk_size\n    output_sentences = output.split(' ')\n    output_chunks = []\n    current_chunk = \"\"\n\n    for sentence in output_sentences:\n        if len(current_chunk) + len(sentence) + 1 <= chunk_size:\n            current_chunk += sentence + ' '\n        else:\n            output_chunks.append(current_chunk.strip())\n            current_chunk = sentence + ' '\n\n    if current_chunk:\n        output_chunks.append(current_chunk.strip())\n    return output_chunks\n\ndef call_gpt_api(prompt, additional_messages=[]):\n    \"\"\"\n    Call GPT API\n    \"\"\"\n    try:\n        response = completion(\n        # response = openai.ChatCompletion.create(\n            model=model,\n            messages=additional_messages+[\n                {\"role\": \"user\", \"content\": prompt}\n            ],\n\n        )\n        message = 
response.choices[0].message.content.strip()\n        return message\n    except Exception as e:\n        print(f\"Error: {e}\")\n        return \"\"\n\ndef handle_start(update, context):\n    return handle('start', update, context)\n\ndef handle_help(update, context):\n    return handle('help', update, context)\n\ndef handle_summarize(update, context):\n    return handle('summarize', update, context)\n\ndef handle_file(update, context):\n    return handle('file', update, context)\n\ndef handle_button_click(update, context):\n    return handle('button_click', update, context)\n\nasync def handle(command, update, context):\n    chat_id = update.effective_chat.id\n    print(\"chat_id=\", chat_id)\n\n    if allowed_users:\n        user_ids = allowed_users.split(',')\n        if str(chat_id) not in user_ids:\n            print(chat_id, \"is not allowed.\")\n            await context.bot.send_message(chat_id=chat_id, text=\"You have no permission to use this bot.\")\n            return\n\n    try:\n        if command == 'start':\n            await context.bot.send_message(chat_id=chat_id, text=\"I can summarize text, URLs, PDFs and YouTube video for you.\")\n        elif command == 'help':\n            await context.bot.send_message(chat_id=chat_id, text=\"Report bugs here 👉 https://github.com/tpai/summary-gpt-bot/issues\", disable_web_page_preview=True)\n        elif command == 'summarize':\n            user_input = update.message.text\n            print(\"user_input=\", user_input)\n\n            text_array = process_user_input(user_input)\n            print(text_array)\n\n            if not text_array:\n                raise ValueError(\"No content found to summarize.\")\n\n            await context.bot.send_chat_action(chat_id=chat_id, action=\"TYPING\")\n            summary = summarize(text_array)\n            await context.bot.send_message(chat_id=chat_id, text=f\"{summary}\", reply_to_message_id=update.message.message_id, 
reply_markup=get_inline_keyboard_buttons())\n        elif command == 'file':\n            file_path = f\"{update.message.document.file_unique_id}.pdf\"\n            print(\"file_path=\", file_path)\n\n            file = await context.bot.get_file(update.message.document)\n            await file.download_to_drive(file_path)\n\n            text_array = []\n            reader = PdfReader(file_path)\n            for page_num in range(len(reader.pages)):\n                page = reader.pages[page_num]\n                text = page.extract_text()\n                text_array.append(text)\n\n            await context.bot.send_chat_action(chat_id=chat_id, action=\"TYPING\")\n            summary = summarize(text_array)\n            await context.bot.send_message(chat_id=chat_id, text=f\"{summary}\", reply_to_message_id=update.message.message_id, reply_markup=get_inline_keyboard_buttons())\n\n            # remove temp file after sending message\n            os.remove(file_path)\n        elif command == 'button_click':\n            original_message_text = update.callback_query.message.text\n            await context.bot.send_chat_action(chat_id=chat_id, action=\"TYPING\")\n\n            if update.callback_query.data == \"explore_similar\":\n                keywords = call_gpt_api(f\"{original_message_text}\\nBased on the content above, give me the top 5 important keywords with commas.\", [\n                    {\"role\": \"system\", \"content\": f\"You will print keywords only.\"}\n                ])\n\n                tasks = [search_results(keywords)]\n                results = await asyncio.gather(*tasks)\n                print(results)\n\n                links = ''\n                for r in results[0]:\n                    links += f\"{r['title']}\\n{r['href']}\\n\"\n\n                await context.bot.send_message(chat_id=chat_id, text=links, reply_to_message_id=update.callback_query.message.message_id, disable_web_page_preview=True)\n\n            if 
update.callback_query.data == \"why_it_matters\":\n                result = call_gpt_api(f\"{original_message_text}\\nBased on the content above, tell me why it matters as an expert.\", [\n                    {\"role\": \"system\", \"content\": f\"You will show the result in {lang}.\"}\n                ])\n                await context.bot.send_message(chat_id=chat_id, text=result, reply_to_message_id=update.callback_query.message.message_id)\n    except Exception as e:\n        print(f\"Error: {e}\")\n        await context.bot.send_message(chat_id=chat_id, text=str(e))\n\n\ndef process_user_input(user_input):\n    youtube_pattern = re.compile(r\"https?://(www\\.|m\\.)?(youtube\\.com|youtu\\.be)/\")\n    url_pattern = re.compile(r\"https?://\")\n\n    if youtube_pattern.match(user_input):\n        text_array = retrieve_yt_transcript_from_url(user_input)\n    elif url_pattern.match(user_input):\n        text_array = scrape_text_from_url(user_input)\n    else:\n        text_array = split_user_input(user_input)\n\n    return text_array\n\ndef get_inline_keyboard_buttons():\n    keyboard = [\n        [InlineKeyboardButton(\"Explore Similar\", callback_data=\"explore_similar\")],\n        [InlineKeyboardButton(\"Why It Matters\", callback_data=\"why_it_matters\")],\n    ]\n    return InlineKeyboardMarkup(keyboard)\n\ndef main():\n    try:\n        application = ApplicationBuilder().token(telegram_token).build()\n        start_handler = CommandHandler('start', handle_start)\n        help_handler = CommandHandler('help', handle_help)\n        summarize_handler = MessageHandler(filters.TEXT & ~filters.COMMAND, handle_summarize)\n        file_handler = MessageHandler(filters.Document.PDF, handle_file)\n        button_click_handler = CallbackQueryHandler(handle_button_click)\n        application.add_handler(file_handler)\n        application.add_handler(start_handler)\n        application.add_handler(help_handler)\n        application.add_handler(summarize_handler)\n        
application.add_handler(button_click_handler)\n        application.run_polling()\n    except Exception as e:\n        print(e)\n\nif __name__ == '__main__':\n    main()\n"
  },
  {
    "path": "requirements.txt",
    "content": "# async handler\nasyncio==3.4.3\n\n# progress tracking\ntqdm==4.66.4\n\n# llm adapter\nlitellm==1.37.9\n\n# text extraction\ntrafilatura==1.9.0 \n\n# duckduckgo\nduckduckgo_search==5.3.0b4\n\n# PDFs\nPyPDF2==3.0.1\n\n# Telegram\npython-telegram-bot==21.1.1\n\n# YouTube\nyoutube_transcript_api==1.2.2\n"
  }
]