[
  {
    "path": ".gitattributes",
    "content": "# Auto detect text files and perform LF normalization\n* text=auto\n"
  },
  {
    "path": ".gitignore",
    "content": ".env\nenv/"
  },
  {
    "path": "README.md",
    "content": "# 7 Days of LangChain\nCode repo for my \"7 Days of LangChain\" series. The repo is still quite messy, but I'll be cleaning it up in the coming week.\n\n# How to start?\nClone the repo or copy the code snippets you'd like to use.\nThen install the dependencies listed in the `requirements.txt` of the day you want to run (for example, from inside `day_1/`):\n\n<code>pip install -r requirements.txt</code>\n\n# Want to know more about the code?\n\nGo and follow me on [Twitter](https://twitter.com/JorisTechTalk) for more details on the code!"
  },
  {
    "path": "day_1/requirements.txt",
    "content": "langchain\nopenai\nyoutube-transcript-api\npytube\ntiktoken\nbs4"
  },
  {
    "path": "day_1/yt_to_strategy.py",
    "content": "\"\"\"\nThis script shows how to create a strategy for a four-hour workday based on a YouTube video.\nWe're using an easy LangChain implementation to show how to use the different components of LangChain.\nThis is part of my '7 Days of LangChain' series. \n\nCheck out the explanation about the code on my Twitter (@JorisTechTalk)\n\n\"\"\"\n\n\nfrom langchain import LLMChain\nfrom langchain.document_loaders import YoutubeLoader\nfrom langchain.text_splitter import TokenTextSplitter\nfrom langchain.chat_models import ChatOpenAI\nfrom langchain.prompts import PromptTemplate\nfrom langchain.chains.summarize import load_summarize_chain\nfrom langchain.callbacks import get_openai_callback\n\nwith get_openai_callback() as cb:\n\n    # Set your OpenAI API Key.\n    openai_api_key = 'YOUR_API_KEY_HERE'\n\n    # Load a youtube video and get the transcript\n    url = \"https://www.youtube.com/watch?v=aV4jKPFOjvk\"\n    loader = YoutubeLoader.from_youtube_url(url, add_video_info=True)\n    data = loader.load()\n\n    # Split the transcript into shorter chunks.\n    # First create the text splitter. The chunk_size is the maximum number of tokens in each chunk.\n    # With the new gpt-3.5-turbo-16k model, you actually don't need it in this example, but it's good to know how to do it.\n    text_splitter = TokenTextSplitter(chunk_size = 5000, chunk_overlap = 100)\n\n    # Then split the transcript into chunks.\n    # The .split_documents() method returns the page_content attribute of the Document object.\n    docs = text_splitter.split_documents(data)\n\n    # Create the prompts. The prompt is the instruction to the model. Prompting is key to getting good results.\n    # Play around with the prompt to get different results.\n    # We create two prompts. 
Since we will be using the refine summarize chain, we need a prompt for the initial 'summarization' of a chunk, and a prompt for the refinement of the summary of subsequent chunks.\n\n\n    # The first prompt is for the initial summarization of a chunk. You can add any info about yourself or the topic you want.\n    # You could specifically focus on a skill you have to get more relevant results.\n    strategy_template = \"\"\"\n        You are an expert in creating strategies for getting a four-hour workday. You are a productivity coach and you have helped many people achieve a four-hour workday.\n        You're goal is to create a detailed strategy for getting a four-hour workday.\n        The strategy should be based on the following text:\n        ------------\n        {text}\n        ------------\n        Given the text, create a detailed strategy. The strategy is aimed to get a working plan on how to achieve a four-hour workday.\n        The strategy should be as detailed as possible.\n        STRATEGY:\n    \"\"\"\n\n    PROMPT_STRATEGY = PromptTemplate(template=strategy_template, input_variables=[\"text\"])\n\n\n    # The second prompt is for the refinement of the summary, based on subsequent chunks.\n    strategy_refine_template = (\n    \"\"\"\n        You are an expert in creating strategies for getting a four-hour workday.\n        You're goal is to create a detailed strategy for getting a four-hour workday.\n        We have provided an existing strategy up to a certain point: {existing_answer}\n        We have the opportunity to refine the strategy\n        (only if needed) with some more context below.\n        ------------\n        {text}\n        ------------\n        Given the new context, refine the strategy.\n        The strategy is aimed to get a working plan on how to achieve a four-hour workday.\n        If the context isn't useful, return the original strategy.\n    \"\"\"\n    )\n\n    PROMPT_STRATEGY_REFINE = PromptTemplate(\n        
input_variables=[\"existing_answer\", \"text\"],\n        template=strategy_refine_template,\n    )\n\n    # Initialize the large language model. You can use the gpt-3.5-turbo-16k model or any model you prefer.\n    # Play around with the temperature parameter to get different results. Higher temperature means more randomness. Lower temperature means more deterministic.\n    llm = ChatOpenAI(openai_api_key=openai_api_key, model_name='gpt-3.5-turbo-16k', temperature=0.5)\n\n    # Initiliaze the chain.\n    # The verbose parameter prints the 'thought process' of the model. It's useful for debugging.\n    strategy_chain = load_summarize_chain(llm=llm, chain_type='refine', verbose=True, question_prompt=PROMPT_STRATEGY, refine_prompt=PROMPT_STRATEGY_REFINE)\n    strategy = strategy_chain.run(docs)\n\n    # Now write the strategy to a file.\n    with open('strategy.txt', 'w') as f:\n        f.write(strategy)\n\n    # Now use this strategy to create a plan.\n    # The plan is a list of steps to take to achieve the goal.\n    # The plan is based on the strategy.\n\n    # Create the prompt for the plan.\n    plan_template = \"\"\"\n        You are an expert in creating plans for getting a four-hour workday. You are a productivity coach and you have helped many people achieve a four-hour workday.\n        You're goal is to create a detailed plan for getting a four-hour workday.\n        The plan should be based on the following strategy:\n        ------------\n        {strategy}\n        ------------\n        Given the strategy, create a detailed plan. 
The plan is aimed to get a working plan on how to achieve a four-hour workday.\n        Think step by step.\n        The plan should be as detailed as possible.\n        PLAN:\n    \"\"\"\n\n    PROMPT_PLAN = PromptTemplate(template=plan_template, input_variables=[\"strategy\"])\n\n    # Initialize the chain.\n    plan_chain = LLMChain(llm=llm, prompt=PROMPT_PLAN, verbose=True)\n    plan = plan_chain(strategy)\n\n    # Now write the plan to a file.\n    with open('plan.txt', 'w') as f:\n        f.write(plan['text'])\n\n# Print the total cost of the API calls.\nprint(cb)"
  },
  {
    "path": "day_2/requirements.txt",
    "content": "openai\nlangchain\ntiktoken"
  },
  {
    "path": "day_2/summary_example.txt",
    "content": "The meeting took place on February 18, 2021, and was focused on the engineering key review at GitLab. Eric Johnson, the meeting organizer, proposed breaking up the meeting into four department key reviews: engineering, development quality, security, and UX infrastructure and support. The reasons for this proposal were increased visibility, the ability to go deeper into each department's work, increased objectivity for managers, more time for Eric to focus on new markets, and a shift into a question-asking role rather than generating content and answering questions. To avoid adding three new meetings to stakeholders' calendars, Eric suggested a two-month rotation, with development quality going in month one and security and UX going in month two. The group expressed support for this proposal, with some members suggesting that the larger development department may need more frequent meetings. However, they agreed to try the proposed rotation and remain flexible.\n\nThe discussion then moved to the R&D overall MR rate and R&D wider MR rate, which are top-level key performance indicators (KPIs) for engineering. The wider MR rate includes both community contributions and community merge requests (MRs), while the overall MR rate includes all MRs. Eric raised concerns about the duplication between the two rates and suggested simplifying the metrics. Lily confirmed that the wider MR rate only captures community contributions, and Eric proposed tracking the percentage of total MRs that come from the community as a KPI instead. The group agreed with this proposal and decided to make the transition.\n\nChristopher mentioned a lag issue with metrics updates in the month of February, particularly in development and MR development. The data team was working on resolving this issue. The discussion then shifted to the Postgres replication issue and who should be responsible for addressing it. 
Eric clarified that the data engineering team should be the DRI (directly responsible individual) for the issue, with infrastructure owning the data source. They discussed the need for a dedicated host, tuning improvements, and addressing overall demand on the database layer. Steve expressed his willingness to partner with Eric on this issue and suggested focusing on getting the biggest impact for the resources allocated.\n\nMech provided an update on defect tracking and meeting service level objectives (SLOs). They mentioned a first iteration performance indicator (PI) that shows the percentage of defects meeting SLOs, with S1 defects at 80% and S2 defects at 60%. They also mentioned working on measuring the average age of open bugs to get a holistic view of the backlog. Craig raised a concern about a spike in mean time to close for S2 defects and asked for insights. Mech noted that they hadn't seen a dip in age or overall count and suggested digging deeper into the issue. Christy suggested that the change in severity levels across the board may be a factor to consider.\n\nMAIN TAKEAWAYS:\n- The proposal to break up the meeting into four department key reviews was supported, with a two-month rotation plan.\n- The R&D wider MR rate will be transitioned to tracking the percentage of total MRs that come from the community.\n- There is a lag issue with metrics updates in February, particularly in development and MR development.\n- The Postgres replication issue will be addressed by the data engineering team, with infrastructure support.\n- Defect tracking and meeting SLOs are ongoing, with a focus on improving the average age of open bugs.\n\nACTION ITEMS:\n- Lily and Max will work on transitioning the R&D wider MR rate to tracking the percentage of total MRs from the community.\n- Steve will provide an update on the Postgres replication issue in the infrastructure key review.\n- Mech will investigate the spike in mean time to close for S2 defects and provide further 
insights.\n- Christy will explore if the change in severity levels impacted the metrics.\n\nDECISIONS:\n- The meeting will be broken up into four department key reviews with a two-month rotation plan.\n- The R&D wider MR rate will be transitioned to tracking the percentage of total MRs from the community.\n\nOPEN QUESTIONS:\n- None mentioned.\n\nNEXT STEPS:\n- Lily and Max will work on transitioning the R&D wider MR rate.\n- Steve will provide an update on the Postgres replication issue in the infrastructure key review.\n- Mech will investigate the spike in mean time to close for S2 defects.\n- Christy will explore the impact of the change in severity levels on the metrics."
  },
  {
    "path": "day_2/voice_to_meeting_notes.py",
    "content": "\"\"\"\nThis script shows how to create a meeting notes based on your recordings.\nWe're using an easy LangChain implementation to show how to use the different components of LangChain.\nAlso includes an integration with OpenAI Whisper.\n\nThis is part of my '7 Days of LangChain' series. \nCheck out the explanation about the code on my Twitter (@JorisTechTalk)\n\n\"\"\"\n\nimport openai\nfrom langchain.docstore.document import Document\nfrom langchain.text_splitter import TokenTextSplitter\nfrom langchain.chains.summarize import load_summarize_chain\nfrom langchain.chat_models import ChatOpenAI\nfrom langchain.prompts import PromptTemplate\nimport os\n\n# Set your API key\nopenai_api_key = 'YOUR_API_KEY_HERE'\n# os.environ[\"OPENAI_API_KEY\"] = 'YOUR_API_KEY_HERE'\n\n# Set your media file path\nmedia_file_path = \"meeting_chunk0.mp3\"\n\n# Open the media file\nmedia_file = open(media_file_path, \"rb\")\n\n# Set your model ID\nmodel_id = \"whisper-1\"\n\n# Call the API\nresponse = openai.Audio.transcribe(\n    api_key=openai_api_key,\n    model=model_id,\n    file=media_file\n)\n\n# Assign the transcript to a variable\ntranscript = response[\"text\"]\n\n# Split the text\ntext_splitter = TokenTextSplitter(model_name=\"gpt-3.5-turbo-16k\", chunk_size=10000, chunk_overlap=300)\ntexts = text_splitter.split_text(transcript)\n\n# Create documents for further processing\ndocs = [Document(page_content=t) for t in texts]\n\n# Create the prompts\n\nprompt_template_summary = \"\"\"\nYou are a management assistant with a specialization in note taking. You are taking notes for a meeting.\n\nWrite a detailed summary of the following transcript of a meeting:\n\n\n{text}\n\nMake sure you don't lose any important information. Be as detailed as possible in your summary. 
\n\nAlso end with a list of:\n\n- Main takeaways\n- Action items\n- Decisions\n- Open questions\n- Next steps\n\nIf there are any follow-up meetings, make sure to include them in the summary and mentioned it specifically.\n\n\nDETAILED SUMMARY IN ENGLISH:\"\"\"\nPROMPT_SUMMARY = PromptTemplate(template=prompt_template_summary, input_variables=[\"text\"])\nrefine_template_summary = (\n'''\nYou are a management assistant with a specialization in note taking. You are taking notes for a meeting.\nYour job is to provide detailed summary of the following transcript of a meeting:\nWe have provided an existing summary up to a certain point: {existing_answer}.\nWe have the opportunity to refine the existing summary (only if needed) with some more context below.\n----------------\n{text}\n----------------\nGiven the new context, refine the original summary in English.\nIf the context isn't useful, return the original summary. Make sure you are detailed in your summary.\nMake sure you don't lose any important information. Be as detailed as possible. \n\nAlso end with a list of:\n\n- Main takeaways\n- Action items\n- Decisions\n- Open questions\n- Next steps\n\nIf there are any follow-up meetings, make sure to include them in the summary and mentioned it specifically.\n\n'''\n)\nrefine_prompt_summary = PromptTemplate(\n    input_variables=[\"existing_answer\", \"text\"],\n    template=refine_template_summary,\n)\n\n# Initialize LLM\nllm = ChatOpenAI(openai_api_key=openai_api_key,temperature=0.2, model_name=\"gpt-3.5-turbo-16k\")\n\n# Create a summary\nsum_chain = load_summarize_chain(llm, chain_type=\"refine\", verbose=True, question_prompt=PROMPT_SUMMARY, refine_prompt=refine_prompt_summary)\nsummary = sum_chain.run(docs)\n\n# Write the response to a file\nwith open(\"summary.txt\", \"w\") as f:\n    f.write(summary)"
  },
  {
    "path": "day_3/mindmap.py",
    "content": "\"\"\"\nThis script shows how to create a mindmap based on your study material.\nWe're using an easy LangChain implementation to show how to use the different components of LangChain.\n\nOnce you have your markdown mindmap, import it to Xmind to create a mindmap.\nThis is part of my '7 Days of LangChain' series. \n\nCheck out the explanation about the code on my Twitter (@JorisTechTalk)\n\n\"\"\"\n\nfrom langchain.chat_models import ChatOpenAI\nfrom langchain.text_splitter import TokenTextSplitter\nfrom langchain.chains.summarize import load_summarize_chain\nfrom langchain.prompts import PromptTemplate\nfrom PyPDF2 import PdfReader\nfrom langchain.docstore.document import Document\nfrom langchain.callbacks import get_openai_callback\n\n# Set your OpenAI API Key.\nopenai_api_key = 'YOUR_API_KEY_HERE'\n\n# Set file path\nfile_path = 'eight.pdf'\n\n# Load Data from PDF for Question Generation\nloader_mindmap = PdfReader(file_path)\n\n# Store all the text in a variable\ntext = \"\"\nfor page in loader_mindmap.pages:\n    text += page.extract_text()\n\n# Split Data For Mindmap Generation\ntext_splitter = TokenTextSplitter(model_name=\"gpt-3.5-turbo-16k\", chunk_size=10000, chunk_overlap=1000)\ntexts_for_mindmap = text_splitter.split_text(text)\ndocs_for_mindmap = [Document(page_content=t) for t in texts_for_mindmap]\n\n# Template for the question generation for every document\n\nprompt_template_mindmap = \"\"\"\n\nYou are an experienced assistant in helping people understand topics through the help of mind maps.\n\nYou are an expert in the field of the requested topic.\n\nMake a mindmap based on the context below. Try to make connections between the different topics and be concise.:\n\n------------\n{text}\n------------\n\nThink step by step.\n\nAlways answer in markdown text. 
Adhere to the following structure:\n\n## Main Topic 1\n\n### Subtopic 1\n- Subtopic 1\n    -Subtopic 1\n    -Subtopic 2\n    -Subtopic 3\n\n### Subtopic 2\n- Subtopic 1\n    -Subtopic 1\n    -Subtopic 2\n    -Subtopic 3\n\n## Main Topic 2\n\n### Subtopic 1\n- Subtopic 1\n    -Subtopic 1\n    -Subtopic 2\n    -Subtopic 3\n\nMake sure you only put out the Markdown text, do not put out anything else. Also make sure you have the correct indentation.\n\n\nMINDMAP IN MARKDOWN:\n\n\"\"\"\n\nPROMPT_MINDMAP = PromptTemplate(template=prompt_template_mindmap, input_variables=[\"text\"])\n\n# Template for refining the mindmap\n\nrefine_template_mindmap = (\"\"\"\n\nYou are an experienced assistant in helping people understand topics through the help of mind maps.\n\nYou are an expert in the field of the requested topic.\n\nWe have received some mindmap in markdown to a certain extent: {existing_answer}.\nWe have the option to refine the existing mindmap or add new parts. Try to make connections between the different topics and be concise.\n(only if necessary) with some more context below\n\"------------\\n\"\n\"{text}\\n\"\n\"------------\\n\"\n\n\nAlways answer in markdown text. Try to make connections between the different topics and be concise. Adhere to the following structure:\n\n## Main Topic 1\n\n### Subtopic 1\n- Subtopic 1\n    -Subtopic 1\n    -Subtopic 2\n    -Subtopic 3\n\n### Subtopic 2\n- Subtopic 1\n    -Subtopic 1\n    -Subtopic 2\n    -Subtopic 3\n\n## Main Topic 2\n\n### Subtopic 1\n- Subtopic 1\n    -Subtopic 1\n    -Subtopic 2\n    -Subtopic 3\n\nMake sure you only put out the Markdown text, do not put out anything else. 
Also make sure you have the correct indentation.\n\nMINDMAP IN MARKDOWN:\n\"\"\"\n)\n                             \nREFINE_PROMPT_MINDMAP = PromptTemplate(\n    input_variables=[\"existing_answer\", \"text\"],\n    template=refine_template_mindmap,\n)\n\n# Tracking cost\nwith get_openai_callback() as cb:\n\n    # Initialize the LLM\n    llm_markdown = ChatOpenAI(openai_api_key=openai_api_key, temperature=0.3, model=\"gpt-3.5-turbo-16k\")\n\n    # Initialize the summarization chain\n    summarize_chain = load_summarize_chain(llm=llm_markdown, chain_type=\"refine\", verbose=True, question_prompt=PROMPT_MINDMAP, refine_prompt=REFINE_PROMPT_MINDMAP)\n\n    # Generate mindmap\n    mindmap = summarize_chain(docs_for_mindmap)\n\n    # Save mindmap to .md file\n    with open(\"mindmap.md\", \"w\") as f:\n        f.write(mindmap['output_text'])\n\n# Print cost\nprint(cb)"
  },
  {
    "path": "day_3/mindmap_example.md",
    "content": "## Eight Things to Know about Large Language Models\n\n### Main Topic 1: Predictability and Capabilities\n\n- Subtopic 1: LLMs get more capable with increasing investment\n    - Increasing investment leads to improved performance\n    - Scaling laws allow for precise prediction of capabilities\n    \n- Subtopic 2: Unpredictable emergence of important behaviors\n    - Specific behaviors can emerge unexpectedly with increasing investment\n    - Models can fail at a task consistently, but a larger model may succeed\n    \n### Main Topic 2: Understanding and Interpretation\n\n- Subtopic 1: LLMs learn and use representations of the outside world\n    - Internal representations of color, objects, and geography\n    - Ability to reason at an abstract level\n    \n- Subtopic 2: Lack of reliable techniques for steering LLM behavior\n    - Limited control over LLM behavior\n    - Challenges in interpreting and guiding LLMs\n    \n### Main Topic 3: Performance and Values\n\n- Subtopic 1: LLM performance surpasses human performance\n    - LLMs trained on more data can outperform humans\n    - Additional training methods improve performance\n    \n- Subtopic 2: LLMs do not necessarily express the values of their creators or web text\n    - Values expressed by LLMs can be controlled and influenced\n    - Third-party input and oversight can shape LLM values\n    \n### Main Topic 4: Interaction and Misleading Behavior\n\n- Subtopic 1: Brief interactions with LLMs can be misleading\n    - Models can be sensitive to instructions and prompt wording\n    - Contingent failures do not necessarily indicate lack of capability"
  },
  {
    "path": "day_3/requirements.txt",
    "content": "openai\nlangchain\ntiktoken\npypdf2"
  },
  {
    "path": "day_4/newsletter_example.txt",
    "content": "Title: Can LLMs Express Their Uncertainty? An Empirical Evaluation of Confidence Elicitation in LLMs\nSummary: This article explores different approaches for estimating the uncertainty of large language models (LLMs) without relying on model fine-tuning or proprietary information. The study introduces verbalize-based, consistency-based, and hybrid methods for benchmarking and evaluates their performance across various datasets and LLMs. The analysis reveals insights such as LLMs often exhibiting overconfidence when verbalizing their confidence and consistency-based methods outperforming verbalized confidences in most cases. The article concludes that hybrid methods show the most promising performance, but there is still room for improvement in confidence elicitation.\nLink: http://arxiv.org/abs/2306.13063v1\n\nTitle: Towards Explainable Evaluation Metrics for Machine Translation\nSummary: This concept paper discusses the need for explainable evaluation metrics for machine translation, as current metrics based on large language models lack transparency. The article identifies key properties and goals of explainable machine translation metrics and provides a synthesis of recent techniques and approaches. It also explores explainable metrics based on generative models like ChatGPT and GPT4. The article envisions next-generation approaches, including natural language explanations, to improve the transparency and acceptance of high-quality metrics for machine translation.\nLink: http://arxiv.org/abs/2306.13041v1\n\nTitle: Tracking public attitudes toward ChatGPT on Twitter using sentiment analysis and topic modeling\nSummary: This article investigates public attitudes towards ChatGPT, a chatbot powered by a large language model, using sentiment analysis and topic modeling techniques applied to Twitter data. The analysis reveals that the overall sentiment towards ChatGPT is largely neutral to positive across different occupation groups. 
The most popular topics mentioned in tweets related to ChatGPT include Artificial Intelligence, Search Engines, Education, Writing, and Question Answering.\nLink: http://arxiv.org/abs/2306.12951v1\n\nTitle: Generative Multimodal Entity Linking\nSummary: This article introduces GEMEL, a simple yet effective method for multimodal entity linking (MEL) that leverages large language models (LLMs) for generating target entity names. Unlike previous complex MEL methods, GEMEL only fine-tunes a linear layer while keeping the vision and language model frozen. The approach utilizes in-context learning capability of LLMs and achieves state-of-the-art results on two well-established MEL datasets with minimal fine-tuning. The article highlights the potential of using LLMs in the MEL task for efficient and general solutions.\nLink: http://arxiv.org/abs/2306.12725v1"
  },
  {
    "path": "day_4/requirements.txt",
    "content": "langchain\nopenai\ntabulate\ntiktoken\ngoogle-api-python-client\ngoogle-auth-oauthlib\ngoogle-auth-httplib2\nbeautifulsoup4"
  },
  {
    "path": "day_4/scientific_newsletter.py",
    "content": "\"\"\"\nThis script shows how to create a newsletter based on the latest Arxiv articles.\nWe're using an easy LangChain implementation to show how to use the different components of LangChain.\nThis is part of my '7 Days of LangChain' series. \n\nCheck out the explanation about the code on my Twitter (@JorisTechTalk)\n\n\"\"\"\n\nfrom langchain.document_loaders import ArxivLoader\nfrom langchain.agents.agent_toolkits import GmailToolkit\nfrom langchain import OpenAI\nimport os\nfrom langchain.agents import initialize_agent, AgentType\nfrom langchain.chat_models import ChatOpenAI\nfrom langchain.prompts import PromptTemplate\nfrom langchain import LLMChain\nfrom langchain.callbacks import get_openai_callback\nimport arxiv\n\n# Topic of the newsletter you want to write about\nquery = \"LLM\"\n\n# Set up the ArxivLoader\nsearch = arxiv.Search(\n  query = query,\n  max_results = 4,\n  sort_by = arxiv.SortCriterion.SubmittedDate\n)\n\n# Initialize the docs variable\ndocs = \"\"\n\n# Add all relevant information to the docs variable\nfor result in search.results():\n    docs += \"Title: \" + result.title + \"\\n\"\n    docs += \"Abstract: \" + result.summary + \"\\n\"\n    docs += \"Download URL: \" + result.pdf_url + \"\\n\"\n    print(result.links)\n    for link in result.links:\n        docs += \"Links: \" + link.href + \"\\n\"\n\n# Track cost\nwith get_openai_callback() as cb:\n\n    # Template for the newsletter\n    prompt_newsletter_template = \"\"\"\n    You are a newsletter writer. You write newsletters about scientific articles. You introduce the article and show a small summary to tell the user what the article is about.\n\n    You're main goal is to write a newsletter which contains summaries to interest the user in the articles.\n\n    --------------------\n    {text}\n    --------------------\n\n    Start with the title of the article. 
Then, write a small summary of the article.\n\n    Below each summary, include the link to the article containing /abs/ in the URL.\n\n    Summaries:\n\n    \"\"\"\n\n    PROMPT_NEWSLETTER = PromptTemplate(template=prompt_newsletter_template, input_variables=[\"text\"])\n\n    # Set the OpenAI API key\n    os.environ['OPENAI_API_KEY'] = 'YOUR_API_KEY_HERE'\n\n    # Initialize the language model\n    llm = ChatOpenAI(temperature=0.6, model_name=\"gpt-3.5-turbo-16k\", verbose=True)\n\n    # Initialize the LLMChain\n    newsletter_chain = LLMChain(llm=llm, prompt=PROMPT_NEWSLETTER, verbose=True)\n\n    # Run the LLMChain\n    newsletter = newsletter_chain.run(docs)\n\n    # Write newsletter to a text file\n    with open(\"newsletter.txt\", \"w\") as f:\n        f.write(newsletter)\n\n    # Set toolkit\n    toolkit = GmailToolkit() \n\n    # Initialize the Gmail agent\n    agent = initialize_agent(\n        tools=toolkit.get_tools(),\n        llm=llm,\n        agent=AgentType.STRUCTURED_CHAT_ZERO_SHOT_REACT_DESCRIPTION,\n        verbose=True\n    )\n\n    # Run the agent\n    instructions = f\"\"\"\n    Write a draft directed to jorisdejong456@gmail.com, NEVER SEND THE EMAIL. \n    The subject should be 'Scientific Newsletter about {query}'. \n    The content should be the following: {newsletter}.\n    \"\"\"\n    agent.run(instructions)\n    print(cb)"
  },
  {
    "path": "day_5/podcast.py",
    "content": "# PODCAST Q&A BOT\n\nfrom langchain.text_splitter import TokenTextSplitter\nfrom langchain.docstore.document import Document\nfrom langchain.chat_models import ChatOpenAI\nfrom langchain.document_loaders import YoutubeLoader\nfrom langchain.prompts import PromptTemplate\nfrom langchain.chains.summarize import load_summarize_chain\nfrom langchain.callbacks import get_openai_callback\nfrom langchain.embeddings.openai import OpenAIEmbeddings\nfrom langchain.vectorstores import FAISS\nfrom langchain.chains import RetrievalQA\n\nwith get_openai_callback() as cb:\n\n    # Load a youtube video and get the transcript\n    loader = YoutubeLoader.from_youtube_url('https://www.youtube.com/watch?v=-hxeDjAxvJ8', add_video_info=True)\n    data = loader.load()\n\n    # Initialize text splitter for summary (Large chunks for better context and less API calls)\n    text_splitter_summary = TokenTextSplitter(chunk_size = 10000, chunk_overlap = 250)\n\n    # Split text into docs for summary\n    docs_summary = text_splitter_summary.split_documents(data)\n\n    # Initialize text splitter for QA (Smaller chunks for better QA)\n    text_splitter_qa = TokenTextSplitter(chunk_size = 1000, chunk_overlap = 200)\n\n    # Split text into docs for QA\n    docs_qa = text_splitter_qa.split_documents(data)\n\n    # Prompts for summary\n\n    # The first prompt is for the initial summarization of a chunk. 
You can add any info about yourself or the topic you want.\n    # You could specifically focus on a skill you have to get more relevant results.\n    summary_template = \"\"\"\n        You are an expert in summarizing YouTube videos.\n        You're goal is to create a summary of a podcast.\n        Below you find the transcript of a podcast:\n        ------------\n        {text}\n        ------------\n\n        The transript of the podcast will also be used as the basis for a question and answer bot.\n        Provide some examples questions and answers that could be asked about the podcast. Make these questions very specific.\n\n        Total output will be a summary of the video and a list of example questions the user could ask of the video.\n\n        SUMMARY AND QUESTIONS:\n    \"\"\"\n\n    PROMPT_SUMMARY = PromptTemplate(template=summary_template, input_variables=[\"text\"])\n\n\n    # The second prompt is for the refinement of the summary, based on subsequent chunks.\n    summary_refine_template = (\n    \"\"\"\n        You are an expert in summarizing YouTube videos.\n        You're goal is to create a summary of a podcast.\n        We have provided an existing summary up to a certain point: {existing_answer}\n        We have the opportunity to refine the summary\n        (only if needed) with some more context below.\n        Below you find the transcript of a podcast:\n        ------------\n        {text}\n        ------------\n        Given the new context, refine the summary and example questions.\n        The transript of the podcast will also be used as the basis for a question and answer bot.\n        Provide some examples questions and answers that could be asked about the podcast. 
Make these questions very specific.\n        If the context isn't useful, return the original summary and questions.\n        Total output will be a summary of the video and a list of example questions the user could ask of the video.\n\n        SUMMARY AND QUESTIONS:\n    \"\"\"\n    )\n\n    PROMPT_SUMMARY_REFINE = PromptTemplate(\n        input_variables=[\"existing_answer\", \"text\"],\n        template=summary_refine_template,\n    )\n\n    # Set OPENAI API key\n    openai_api_key = 'YOUR_API_KEY'\n\n    # Initialize LLM\n    llm_summary = ChatOpenAI(openai_api_key=openai_api_key, model_name='gpt-3.5-turbo-16k', temperature=0.3)\n\n    # Initialize summarization chain\n    summarize_chain = load_summarize_chain(llm=llm_summary, chain_type=\"refine\", verbose=True, question_prompt=PROMPT_SUMMARY, refine_prompt=PROMPT_SUMMARY_REFINE)\n    summary = summarize_chain.run(docs_summary)\n\n    # Write summary to file\n    with open(\"summary.txt\", \"w\") as f:\n        f.write(summary)\n\n    # Create the LLM model for the question answering\n    llm_question_answer = ChatOpenAI(openai_api_key=openai_api_key,temperature=0.2, model=\"gpt-3.5-turbo-16k\")\n\n    # Create the vector database and RetrievalQA Chain\n    embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)\n    db = FAISS.from_documents(docs_qa, embeddings)\n    qa = RetrievalQA.from_chain_type(llm=llm_question_answer, chain_type=\"stuff\", retriever=db.as_retriever())\n\n\n    question = \"\"\n    # Run the QA chain continuously\n    while question != \"exit\":\n        # Get the user question\n        question = input(\"Ask a question or enter exit to close the app: \")\n        # Run the QA chain\n        answer = qa.run(question)\n        print(answer)\n        print(\"---------------------------------\")\n        print(\"\\n\")\n\nprint(cb)"
  },
  {
    "path": "day_5/requirements.txt",
    "content": "langchain\nopenai\ntiktoken\nyoutube-transcript-api\npytube\nfaiss-cpu\n"
  },
  {
    "path": "day_6/compare_files.py",
    "content": "from pydantic import BaseModel, Field\nfrom langchain.chat_models import ChatOpenAI\nfrom langchain.agents import Tool\nfrom langchain.embeddings.openai import OpenAIEmbeddings\nfrom langchain.text_splitter import CharacterTextSplitter\nfrom langchain.vectorstores import FAISS\nfrom langchain.document_loaders import PyPDFLoader\nfrom langchain.chains import RetrievalQA\nfrom langchain.agents import initialize_agent\nfrom langchain.agents import AgentType\nimport os\n\n# Create a custom input schema\nclass DocumentInput(BaseModel):\n    question: str = Field()\n\n# Set OpenAI API key as environment variable\nos.environ[\"OPENAI_API_KEY\"] = \"YOUR_OPENAI_API_KEY\"\n\n# List of files you want to compare\nfiles = [\n    {\n        \"name\": \"Volkswagen-earnings-Q1-2023\",\n        \"path\": \"files/Volkswagen-Q1_2023.pdf\"\n    },\n    {\n        \"name\": \"tesla-earning-Q1-2023\",\n        \"path\": \"files/TSLA-Q1-2023-Update.pdf\"\n    },\n]\n\n# Initialize a list of tools\ntools = []\n\n# Initialize the LLM\nllm = ChatOpenAI(temperature=0, model=\"gpt-3.5-turbo-0613\")\n\n# Loop over the files\nfor file in files:\n    # Load the documents\n    loader = PyPDFLoader(file[\"path\"])\n    pages = loader.load_and_split()\n\n    # Split the documents into chunks\n    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n    docs = text_splitter.split_documents(pages)\n    print(f\"Loaded {len(docs)} documents from {file['name']}\")\n\n    # Vectorize the documents and create a retriever\n    embeddings = OpenAIEmbeddings()\n    retriever = FAISS.from_documents(docs, embeddings).as_retriever()\n    \n    # Wrap retrievers in a Tool\n    tools.append(\n        Tool(\n            args_schema=DocumentInput,\n            name=file[\"name\"], \n            description=f\"useful when you want to answer questions about {file['name']}\",\n            func=RetrievalQA.from_chain_type(llm=llm, retriever=retriever)\n        )\n    )\n\n# 
Initialize LLM for the agent\nllm = ChatOpenAI(\n    temperature=0,\n    model=\"gpt-3.5-turbo-0613\", \n)\n\n# Initialize the agent\nagent = initialize_agent(\n    agent=AgentType.OPENAI_FUNCTIONS,\n    tools=tools,\n    llm=llm,\n    verbose=True,\n)\n\n# Initialize the question variable\nquestion = \"\"\n\n# Run a loop to ask questions\nwhile True and question != \"exit\":\n    question = input(\"Ask a question or write exit to quit: \")\n    if question == \"exit\":\n        break\n    answer = agent({\"input\": question})\n    print(answer[\"output\"])\n    print(\"------\")"
  },
  {
    "path": "day_6/requirements.txt",
    "content": "langchain\nopenai\npypdf\ntiktoken\nfaiss-cpu\npycryptodome"
  },
  {
    "path": "day_7/learning_path.py",
    "content": "from langchain.document_loaders import YoutubeLoader\nfrom langchain.text_splitter import TokenTextSplitter\nfrom langchain.prompts import PromptTemplate\nfrom langchain.chains.summarize import load_summarize_chain\nfrom langchain.callbacks import get_openai_callback\nfrom langchain.chat_models import ChatOpenAI\nfrom langchain.vectorstores import FAISS\nfrom langchain.docstore.document import Document\nfrom langchain.embeddings.openai import OpenAIEmbeddings\nfrom langchain.chains import RetrievalQA\nfrom langchain import LLMChain\nimport os\n\n# Set openai api key as environment variable\n# os.environ[\"OPENAI_API_KEY\"] = \"YOUR_API_KEY\"\n\n# Set OpenAI API key as environment variable\nos.environ[\"OPENAI_API_KEY\"] = \"YOUR_OPENAI_API_KEY\"\n\n# List of Youtube Urls\n# If you want to load all the videos from a channel, use the code in youtube_ids.py\nyoutube_urls = [\n    \"https://www.youtube.com/watch?v=pEkxRQFNAs4\", # Extract Topics From Video/Audio With LLMs (Topic Modeling w/ LangChain)\n    \"https://www.youtube.com/watch?v=GvQC5BHBkoM\", # 8 Things New SaaS Developers Need To Know on DAY 1\n    \"https://www.youtube.com/watch?v=QoBCtcWO02g\", # The One Person Business Model 2.0 (Turn Yourself Into A Business)\n    \"https://www.youtube.com/watch?v=mMv6OSuitWw\", # Python 101: Learn the 5 Must-Know Concepts\n    \"https://www.youtube.com/watch?v=2zW5emKWof8\", # 10 Years of Coding: What I Wish I Knew Before I Started\n]\n\n# Create a template for extracting topics from a text. \nextract_topics_template = \"\"\"\n    You are an expert in extracting skills being taught from a transcript of a video.\n    Your goal is to extract the skills taught from the transcript below.\n    The skills will be used to give the user an idea of what will be learned in the video.\n\n    Transcript:\n    ------------\n    {text}\n    ------------\n\n    The description of the skills should be descriptive, but short and concise. 
Mention what overarching skill would be learned.\n    \n    Example:\n\n    Implementing continuous delivery for faster shipping - Software development\n    Evaluating and selecting a suitable tech stack for SaaS development - Software development\n    Recognizing the importance of marketing and customer communication in building a successful SaaS business - Business and marketing\n\n    Don't add numbers. Just each skill on a new line.\n\n    SKILLS - OVERARCHING SKILL:\n\"\"\"\n\nPROMPT_EXTRACT_TOPICS = PromptTemplate(template=extract_topics_template, input_variables=[\"text\"])\n\n\n# The second prompt is for the refinement of the summary and topics, based on subsequent chunks.\nextract_topics_refine_template = (\n\"\"\"\n    You are an expert in extracting skills from a transcript of a video.\n    Your goal is to extract the skills taught from the transcript below.\n    The skills will be used to give the user an idea of what will be learned in the video.\n\n    We have provided a list of skills up to a certain point: {existing_answer}\n    We have the opportunity to refine the skills\n    (only if needed) with some more context below.\n    ------------\n    {text}\n    ------------\n    Given the new context, refine the skills discussed.\n    If the context isn't useful, return the list of skills.\n    The description of the skills should be descriptive, but short and concise. Mention what overarching skill would be learned.\n\n    Example:\n\n    Implementing continuous delivery for faster shipping - Software development\n    Evaluating and selecting a suitable tech stack for SaaS development - Software development\n    Recognizing the importance of marketing and customer communication in building a successful SaaS business - Business and marketing\n\n    Don't add numbers. 
Just each skill on a new line.\n\n    SKILLS - OVERARCHING SKILL:\n\"\"\"\n)\n\nPROMPT_EXTRACT_TOPICS_REFINE = PromptTemplate(\n    input_variables=[\"existing_answer\", \"text\"],\n    template=extract_topics_refine_template,\n)\n\n# Prompt for generating a list of subskills needed to master a skill\n\nsubskills_template = \"\"\"\nYou are an assistant specialized in designing learning paths for people trying to acquire a particular skill-set. \n\nYour goal is to make a list of sub skills a person needs to become proficient in a particular skill.\n\nThe skill set you need to design a learning path for is: {skill_set}\n\nThe user will say which skill set they want to learn, and you'll provide a short and concise list of specific skills this person needs to learn. \n\nThis list will be used to find YouTube videos related to those skills. Don't mention youtube videos though! Name only 5 skills maximum.\n\"\"\"\n\nPROMPT_SUBSKILLS = PromptTemplate(template=subskills_template, input_variables=[\"skill_set\"])\n\n# Prompt for finding a video based on a subskill set\n\nfind_video_template = \"\"\"\nYou are an assistant specialized in designing learning paths for people trying to acquire a particular skill-set.\n\nYour goal is to find a list of videos that teach a particular skill.\n\nIt should be based on the following context:\n\n{context}\n\nLook for videos that teach the following skills: {skill_set}\n\nRETURN A LIST OF VIDEOS WITH YOUTUBE URL AND TITLE:\n\"\"\"\n\nPROMPT_FIND_VIDEO = PromptTemplate(template=find_video_template, input_variables=[\"context\",\"skill_set\"])\n\n# Initialize the large language model. You can use the gpt-3.5-turbo-16k model or any model you prefer.\n# Play around with the temperature parameter to get different results. Higher temperature means more randomness. 
Lower temperature means more deterministic.\nllm = ChatOpenAI(model_name='gpt-3.5-turbo-16k', temperature=0)\n\n\n# Initialize empty document list\ndocuments = []\n\nwith get_openai_callback() as cb:\n\n    # Loop over the youtube urls\n    for url in youtube_urls:\n\n        # Load a youtube video and get the transcript\n        youtube_url = url\n        loader = YoutubeLoader.from_youtube_url(youtube_url=youtube_url, add_video_info=True)\n        data = loader.load()\n        metadata = data[0].metadata\n        title = metadata['title']\n        author = metadata['author']\n\n        # Split the transcript into shorter chunks.\n        # First create the text splitter. The chunk_size is the maximum number of tokens in each chunk.\n        text_splitter = TokenTextSplitter(chunk_size = 2000, chunk_overlap = 100)\n\n        # Then split the transcript into chunks.\n        # The .split_documents() method returns the page_content attribute of the Document object.\n        docs = text_splitter.split_documents(data)\n\n        # Initialize the summarization chain\n        extract_topics_chain = load_summarize_chain(llm=llm, chain_type=\"refine\", verbose=True, question_prompt = PROMPT_EXTRACT_TOPICS, refine_prompt = PROMPT_EXTRACT_TOPICS_REFINE)\n        extracted_topics = extract_topics_chain(docs)\n\n        video_overview = \"\"\n\n        # Add the YouTube Channel name, video title, URL and extracted topics to the video overview\n        video_overview += f\"YouTube Channel: {author}\\n\"\n        video_overview += f\"YouTube Video: {title}\\n\"\n        video_overview += f\"YouTube URL: {youtube_url}\\n\"\n        video_overview += \"Skills: \\n\"\n        video_overview += extracted_topics['output_text']\n\n        # Create a document object with the video overview\n        docs = Document(page_content=video_overview, metadata={\"title\": title, \"author\": author, \"url\": youtube_url})\n\n        # Add the document to the documents list\n        
documents.append(docs)\n\n    # Initialize the embeddings\n    embeddings = OpenAIEmbeddings()\n\n    # Create a vector store with the documents  \n    vector_store = FAISS.from_documents(documents, embeddings)\n\n    # Save the vector store\n    vector_store.save_local(\"vector/\", \"vector_store\")\n\n    # Load the vector store\n    vector_store = FAISS.load_local(\"vector/\", embeddings, \"vector_store\")\n\n    # Initialize the subskills chain\n    subskills_chain = LLMChain(llm=llm, prompt=PROMPT_SUBSKILLS)\n\n    # Loop for questions\n    while True:\n        # Ask the user what skill they want to learn\n        skill_set = input(\"What skill set do you want to learn? \")\n\n        # Use skillset to find subskills\n        subskills = subskills_chain.predict(skill_set=skill_set)\n\n        # Print subskills\n        print(f\"Subskills: \\n {subskills}\\n\")\n\n        # Initialize the retrieval chain\n        qa = RetrievalQA.from_chain_type(llm = llm, retriever = vector_store.as_retriever(), chain_type=\"stuff\", verbose=True)\n\n        # Set query to ask\n        query = f\"Which Youtube videos teach {subskills}?\"\n\n        # Use query to find videos\n        videos = qa.run(query)\n\n        # Print videos\n        print(f\"Videos: \\n {videos}\\n\")\n    "
  },
  {
    "path": "day_7/requirements.txt",
    "content": "langchain\nopenai\ntiktoken\nunstructured\nyoutube-transcript-api\nfaiss-cpu\npytube\ngoogle-api-python-client"
  },
  {
    "path": "day_7/youtube_ids.py",
    "content": "# If you want to load all YouTube videos from a specific channel in one go, use these functions.\n\nimport googleapiclient.discovery\nfrom tqdm import tqdm\nfrom youtube_transcript_api import YouTubeTranscriptApi\nfrom youtube_transcript_api.formatters import TextFormatter\n\napi_key = \"Your Google Dev API Key\" #@param {type:\"string\"}\nchannel_id = \"\" #@param {type:\"string\"} # Get your channel ID here https://commentpicker.com/youtube-channel-id.php\n\ndef get_channel_videos(channel_id, api_key):\n    youtube = googleapiclient.discovery.build(\n        \"youtube\", \"v3\", developerKey=api_key)\n\n    video_ids = []\n    page_token = None\n\n    while True:\n        request = youtube.search().list(\n            part=\"snippet\",\n            channelId=channel_id,\n            maxResults=10,  # Fetch 10 videos per page\n            pageToken=page_token  # Add pagination\n        )\n        response = request.execute()\n\n        video_ids += [item['id']['videoId'] for item in response['items'] if item['id']['kind'] == 'youtube#video']\n        \n        # Check if there are more videos to fetch\n        if 'nextPageToken' in response:\n            page_token = response['nextPageToken']\n        else:\n            break\n\n    return video_ids\n\ndef get_transcript(video_id):\n    # Get transcript list\n    transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)\n    transcripts_manual = transcript_list._manually_created_transcripts\n\n    # Get transcript. 
If no manually created transcript is available, use the automatically generated one.\n    if transcripts_manual:\n        transcript = YouTubeTranscriptApi.get_transcript(video_id)\n    else:\n        transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=['nl', 'en'])\n\n    # Format transcript as text\n    formatter = TextFormatter()\n    text_transcript = formatter.format_transcript(transcript)\n    text_transcript = text_transcript.replace('\\n', ' ')\n\n    return text_transcript    \n\nvideo_ids = get_channel_videos(channel_id, api_key)\n\ntranscript = get_transcript(video_ids[0])"
  },
  {
    "path": "requirements.txt",
    "content": "langchain\nopenai\nyoutube-transcript-api\npytube\ntiktoken\nbs4\npypdf2\ngoogle-api-python-client\ngoogle-auth-oauthlib\ngoogle-auth-httplib2\nbeautifulsoup4\ntabulate\n"
  }
]