Repository: andkret/Cookbook Branch: master Commit: d58e9a70e031 Files: 24 Total size: 595.6 KB Directory structure: gitextract_czkzynip/ ├── .github/ │ └── workflows/ │ ├── copy-to-documenation-branch.yml │ └── linkchecker.yml ├── .gitignore ├── Code Examples/ │ ├── #102 Spark Week Day 3.txt │ ├── GenAI-RAG/ │ │ ├── conversations.json │ │ ├── cvpipeline.py │ │ ├── docker-compose.yml │ │ ├── index.py │ │ └── query.py │ └── Movies.txt ├── FUNDING.yml ├── LICENSE ├── README.md ├── images/ │ └── Data-Engineering-Roadmap-for.textClipping └── sections/ ├── 01-Introduction.md ├── 02-BasicSkills.md ├── 03-AdvancedSkills.md ├── 04-HandsOnCourse.md ├── 05-CaseStudies.md ├── 06-BestPracticesCloud.md ├── 07-DataSources.md ├── 08-InterviewQuestions.md ├── 09-BooksAndCourses.md └── 10-Updates.md ================================================ FILE CONTENTS ================================================ ================================================ FILE: .github/workflows/copy-to-documenation-branch.yml ================================================ name: Copy from master to documentation branch # Controls when the action will run. on: # Triggers the workflow on push events, but only for the master branch push: branches: [master] # Allows you to run this workflow manually from the Actions tab workflow_dispatch: jobs: copy-images: runs-on: ubuntu-latest steps: # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it - uses: actions/checkout@v2 - name: Copy Images uses: andstor/copycat-action@v3 with: personal_token: ${{ secrets.ACTION_TOKEN }} src_branch: master src_path: /images/. dst_owner: andkret dst_repo_name: Cookbook dst_path: /static/images/ dst_branch: documentation clean: true commit_message: "Images copied from master to documentation branch!" 
copy-sections: runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 - name: Copy Markdowns uses: andstor/copycat-action@v3 with: personal_token: ${{ secrets.ACTION_TOKEN }} src_branch: master src_path: /sections/. dst_owner: andkret dst_repo_name: Cookbook dst_path: /docs/ dst_branch: documentation clean: true commit_message: "Sections copied from master to documentation branch!" # copy-readme: # runs-on: ubuntu-latest # steps: # - uses: actions/checkout@v2 # - name: Copy Markdowns # uses: andstor/copycat-action@v3 # with: # personal_token: ${{ secrets.ACTION_TOKEN }} # src_branch: master # src_path: README.md # dst_owner: andkret # dst_repo_name: Cookbook # dst_path: /docs/00-TableOfContents.md # dst_branch: documentation # clean: false # commit_message: "Readme copied from master to documentation branch!" # copy-readme: # runs-on: ubuntu-latest # steps: # - uses: actions/checkout@v2 # - name: Copy Markdowns # uses: andstor/copycat-action@v3 # with: # personal_token: ${{ secrets.PERSONAL_TOKEN }} # src_branch: master # src_path: /README.md # dst_owner: andkret # dst_repo_name: Cookbook # dst_path: /docs/ # dst_branch: documentation # commit_message: "README.md copied from master to documentation branch!" 
================================================ FILE: .github/workflows/linkchecker.yml ================================================ #on: # schedule: # - cron: '0 9 * * 1' # workflow_dispatch: #jobs: # linkChecker: # runs-on: ubuntu-latest # steps: # - name: update setuptools # run: | # python3 -m pip install --upgrade pip setuptools wheel # - uses: actions/checkout@v2 # - name: Link Checker # uses: lycheeverse/lychee-action@master # with: # args: --verbose --no-progress --accept 200,204,206,406,429,999 --include-mail ./sections/*.md # - name: Create Issue From File # uses: peter-evans/create-issue-from-file@v5 # with: # title: Link Checker Report # content-filepath: ./lychee/out.md # labels: report, automated issue ================================================ FILE: .gitignore ================================================ # Ignore build artefacts *.aux *.log *.lof *.lot *.toc *.out *.synctex.gz node_modules/* ================================================ FILE: Code Examples/#102 Spark Week Day 3.txt ================================================
// Notebook paragraphs that load /notebook/Movies.txt, expose it through several
// temporary views, and then query those views with SparkSQL.
// NOTE(review): assumes the notebook predefines `sc` and `spark` and has the
// implicits required by toDF() in scope — confirm for your environment.
// NOTE(review): the DataFrame read below uses header=true, so the file presumably
// starts with a header row; the RDD-based paragraphs treat that row as data — confirm.

// Read in the text file
val input = sc.textFile("/notebook/Movies.txt")

case class MovieLine(Line: String)

val movieline = input.map(line => MovieLine(line))
// createOrReplaceTempView is used throughout instead of registerTempTable,
// which is deprecated since Spark 2.0.
movieline.toDF().createOrReplaceTempView("MovieLine")

// Let's map the date and the genre (fields 0 and 3 of each ';'-separated line)
case class DateAndGenre(myDate: String, Genre: String)

val dateandgenre = input.map(line => line.split(";")).map(s => DateAndGenre(s(0), s(3)))
dateandgenre.toDF().createOrReplaceTempView("DateAndGenre")

// Count how many movies per year
case class MovieDate(Line: String, myCount: Int)

// Pair every line's first field with a 1 so the pairs can be summed per key
val countdate = input.map(line => line.split(";")).map(s => (s(0), 1))
countdate.toDF().createOrReplaceTempView("countdate")

val reduceddate = countdate.reduceByKey((a, b) => a + b).map(s => MovieDate(s._1, s._2))
reduceddate.toDF().createOrReplaceTempView("MovieDate")

// Flatten every ';'-separated field into its own element of the RDD
val flatmappedinput = input.flatMap(line => line.split(";"))
flatmappedinput.toDF().createOrReplaceTempView("flatinput")

// Read the input directly into a DataFrame
val inputasdf = spark.read.format("csv").option("header", "true").option("delimiter", ";").load("/notebook/Movies.txt")
inputasdf.createOrReplaceTempView("inputdf")

/*
//Use this to store the dataframe as parquet on the local drive
//(writes the same path that the parquet read below loads)
val reduceddf = reduceddate.toDF()
reduceddf.write.parquet("/notebook/movie.parquet")
*/

// Read the parquet file
val parquetFileDF = spark.read.parquet("/notebook/movie.parquet")
parquetFileDF.createOrReplaceTempView("ParquetRead")

//SparkSQL Queries:

//Visualize the raw RDD
%sql select * from MovieLine

//Visualize the map reduced RDD with count of movies per year
%sql select Line, myCount from MovieDate order by myCount desc

//Visualize the mapped RDD and count the nr. of movies per year in SparkSQL
%sql select myDate, count(myDate) as counted from DateAndGenre group by myDate order by counted desc

%sql select * from flatinput

%sql select * from ParquetRead
================================================ FILE: Code Examples/GenAI-RAG/conversations.json ================================================ [ { "conversation_id": 456, "customer_name": "Alice Brown", "agent_name": "Emily Johnson", "policy_number": "ABC5678", "conversation": "Customer: Hi, my name is Alice Brown. Date of Birth is September 20th, 1980, Address is 456 Oak St, Springfield, IL 62701, and my Policy Number is XYZ9876543.\nAgent: Good afternoon, Alice. How may I assist you today?\nCustomer: Hello, Emily. I have a question regarding my coverage.\nCustomer: My kitchen caught fire, and I'm concerned about the damages.\nAgent: I'm sorry to hear that, Alice. Let me review your policy for fire damage coverage.\nAgent: It appears that fire damage is covered under your policy. We'll assist you with the claim process.\nCustomer: Thank you, Emily. I appreciate your help during this stressful time.\nAgent: You're welcome, Alice. We're here to support you. 
Please don't hesitate to reach out if you need further assistance.\nCustomer: I'll keep that in mind. Have a great day!\nAgent: You too, Alice. Take care.", "summary": "A customer inquires about policy coverage after a kitchen fire, expressing concern, and the agent confirms coverage and offers assistance, providing support and reassurance throughout the conversation." }, { "conversation_id": 789, "customer_name": "David Johnson", "agent_name": "Sarah Wilson", "policy_number": "LMN9012", "conversation": "Customer: Good morning, I'm David Johnson. My Date of Birth is May 5th, 1975, Address is 789 Maple Ave, Seattle, WA 98101, and my Policy Number is PQR3456789.\nAgent: Good morning, David. How can I assist you today?\nCustomer: Hi, Sarah. I'm concerned about my home insurance coverage.\nCustomer: A pipe burst in my basement, and there's significant water damage.\nAgent: I'm sorry to hear that, David. Let me check your policy for coverage related to water damage.\nAgent: It seems that water damage from burst pipes is covered under your policy.\nCustomer: That's a relief. I'll need to file a claim as soon as possible.\nAgent: We'll assist you with the claim process, David. Is there anything else I can help you with?\nCustomer: No, that's all for now. Thank you for your assistance, Sarah.\nAgent: You're welcome, David. Please feel free to reach out if you have any further questions or concerns.\nCustomer: I will. Have a great day!\nAgent: You too, David. Take care.", "summary": "A customer expresses concern about home insurance coverage due to water damage from a burst pipe, and the agent confirms coverage, offering assistance with the claim process, resulting in relief and gratitude expressed by the customer." }, { "conversation_id": 101, "customer_name": "Emily Green", "agent_name": "Jack Smith", "policy_number": "DEF4567", "conversation": "Customer: Hi there, I'm Emily Green. 
My Date of Birth is April 10th, 1988, Address is 101 Pine St, Boston, MA 02101, and my Policy Number is DEF4567.\nAgent: Hello, Emily. How can I assist you today?\nCustomer: Hi, Jack. I have a question about my policy.\nCustomer: A window in my living room shattered during a storm. Is this covered?\nAgent: Let me check your policy for coverage related to storm damage.\nAgent: Unfortunately, damage to windows from storms is not covered under your policy.\nCustomer: Oh, that's disappointing. Is there any way to add coverage for this?\nAgent: Yes, we offer endorsements for specific perils like storm damage to windows. I can provide you with more information on that.\nCustomer: Please do. I want to ensure I'm protected in case this happens again.\nAgent: I'll send you an email with details on our endorsement options. Feel free to reach out if you have any further questions.\nCustomer: Thank you, Jack. I appreciate your help.\nAgent: You're welcome, Emily. Have a great day!", "summary": "A customer inquires about coverage for a shattered window after a storm, but it's not covered under the policy. The agent suggests adding endorsements for specific perils like storm damage to windows, providing further information and assistance, resulting in the customer's appreciation." }, { "conversation_id": 102, "customer_name": "Michael White", "agent_name": "Sarah Johnson", "policy_number": "GHI7890", "conversation": "Customer: Good afternoon, I'm Michael White. My Date of Birth is February 25th, 1970, Address is 202 Elm St, Chicago, IL 60601, and my Policy Number is GHI7890.\nAgent: Good afternoon, Michael. How may I assist you today?\nCustomer: Hi, Sarah. I have a question about my policy coverage.\nCustomer: My roof has started leaking after heavy rainfall. Will my insurance cover repairs?\nAgent: Let me review your policy for coverage related to roof leaks.\nAgent: Roof leaks due to rain are typically covered under your policy.\nCustomer: That's a relief. 
I'll need to schedule repairs as soon as possible.\nAgent: We'll assist you with the claim process, Michael. Is there anything else I can help you with?\nCustomer: No, that's all for now. Thank you for your assistance, Sarah.\nAgent: You're welcome, Michael. Please feel free to reach out if you have any further questions or concerns.\nCustomer: I will. Have a great day!\nAgent: You too, Michael. Take care.", "summary": "A customer seeks clarification on policy coverage for a leaking roof after heavy rainfall, and the agent confirms that such damages are typically covered under the policy. The agent offers assistance with the claim process, resulting in the customer expressing relief and gratitude." }, { "conversation_id": 103, "customer_name": "Sophia Jones", "agent_name": "Emily Wilson", "policy_number": "JKL0123", "conversation": "Customer: Hi, I'm Sophia Jones. My Date of Birth is November 15th, 1985, Address is 303 Cedar St, Miami, FL 33101, and my Policy Number is JKL0123.\nAgent: Hello, Sophia. How may I assist you today?\nCustomer: Hello, Emily. I have a question about my policy.\nCustomer: There's been a break-in at my home, and some valuable items are missing. Are they covered?\nAgent: Let me check your policy for coverage related to theft.\nAgent: Yes, theft of personal belongings is covered under your policy.\nCustomer: That's a relief. I'll need to file a claim for the stolen items.\nAgent: We'll assist you with the claim process, Sophia. Is there anything else I can help you with?\nCustomer: No, that's all for now. Thank you for your assistance, Emily.\nAgent: You're welcome, Sophia. Please feel free to reach out if you have any further questions or concerns.\nCustomer: I will. Have a great day!\nAgent: You too, Sophia. Take care.", "summary": "A customer inquires about coverage for stolen items after a break-in at home, and the agent confirms that theft of personal belongings is covered under the policy. 
The agent offers assistance with the claim process, resulting in the customer expressing relief and gratitude." }, { "conversation_id": 104, "customer_name": "Ethan Wilson", "agent_name": "Jack Brown", "policy_number": "MNO3456", "conversation": "Customer: Hello, I'm Ethan Wilson. My Date of Birth is July 5th, 1995, Address is 404 Oak St, Los Angeles, CA 90001, and my Policy Number is MNO3456.\nAgent: Good morning, Ethan. How may I assist you today?\nCustomer: Hi, Jack. I have a question regarding my policy.\nCustomer: My garage door was damaged in a storm. Is this covered?\nAgent: Let me review your policy for coverage related to storm damage.\nAgent: Yes, damage to the garage door from storms is covered under your policy.\nCustomer: That's a relief. I'll need to schedule repairs as soon as possible.\nAgent: We'll assist you with the claim process, Ethan. Is there anything else I can help you with?\nCustomer: No, that's all for now. Thank you for your assistance, Jack.\nAgent: You're welcome, Ethan. Please feel free to reach out if you have any further questions or concerns.\nCustomer: I will. Have a great day!\nAgent: You too, Ethan. Take care.", "summary": "A customer inquires about coverage for a damaged garage door after a storm, and the agent confirms that such damages are covered under the policy. The agent offers assistance with the claim process, resulting in the customer expressing relief and gratitude." }, { "conversation_id": 105, "customer_name": "Olivia Taylor", "agent_name": "Sarah Smith", "policy_number": "PQR7890", "conversation": "Customer: Hi there, I'm Olivia Taylor. My Date of Birth is December 30th, 1990, Address is 505 Pine St, San Francisco, CA 94101, and my Policy Number is PQR7890.\nAgent: Good afternoon, Olivia. How may I assist you today?\nCustomer: Hi, Sarah. I have a question regarding my policy.\nCustomer: A tree in my backyard has fallen and damaged my fence. 
Will my insurance cover repairs?\nAgent: Let me check your policy for coverage related to fallen trees.\nAgent: Yes, damage to the fence from fallen trees is covered under your policy.\nCustomer: That's a relief. I'll need to schedule repairs as soon as possible.\nAgent: We'll assist you with the claim process, Olivia. Is there anything else I can help you with?\nCustomer: No, that's all for now. Thank you for your assistance, Sarah.\nAgent: You're welcome, Olivia. Please feel free to reach out if you have any further questions or concerns.\nCustomer: I will. Have a great day!\nAgent: You too, Olivia. Take care.", "summary": "A customer inquires about coverage for a damaged fence due to a fallen tree, and the agent confirms that such damages are covered under the policy. The agent offers assistance with the claim process, resulting in the customer expressing relief and gratitude." }, { "conversation_id": 106, "customer_name": "William Anderson", "agent_name": "Jack Johnson", "policy_number": "STU2345", "conversation": "Customer: Hello, I'm William Anderson. My Date of Birth is August 20th, 1980, Address is 606 Elm St, Dallas, TX 75201, and my Policy Number is STU2345.\nAgent: Good morning, William. How may I assist you today?\nCustomer: Hi, Jack. I have a question about my policy.\nCustomer: My basement flooded during heavy rainfall. Is water damage covered?\nAgent: Let me review your policy for coverage related to water damage.\nAgent: Yes, water damage from flooding is covered under your policy.\nCustomer: That's a relief. I'll need to schedule repairs as soon as possible.\nAgent: We'll assist you with the claim process, William. Is there anything else I can help you with?\nCustomer: No, that's all for now. Thank you for your assistance, Jack.\nAgent: You're welcome, William. Please feel free to reach out if you have any further questions or concerns.\nCustomer: I will. Have a great day!\nAgent: You too, William. 
Take care.", "summary": "A customer inquires about coverage for water damage after a basement flooding, and the agent confirms that such damages are covered under the policy. The agent offers assistance with the claim process, resulting in the customer expressing relief and gratitude." }, { "conversation_id": 123, "customer_name": "Alice Smith", "agent_name": "Emily Johnson", "policy_number": "ABC5678", "conversation": "Customer: Hi, my name is Alice Smith, Date of Birth is Feb 15th 1985, Address is 123 Main St, Anytown, NY 12345, and my Policy Number is XYZ9876.\nAgent: Hello, Alice. How can I assist you today?\nCustomer: I have a question about my home insurance coverage.\nCustomer: I noticed some water damage in my basement, and I'm not sure if it's covered.\nAgent: I'm sorry to hear about the damage. Let me review your policy to see what's covered.\nAgent: Based on your policy, water damage from burst pipes is covered, but it depends on the cause of the damage.\nCustomer: What if it's from heavy rainfall or flooding?\nAgent: Unfortunately, damage from flooding is typically not covered under standard home insurance policies.\nCustomer: That's disappointing. Is there anything I can do to get coverage for flooding?\nAgent: You may want to consider purchasing a separate flood insurance policy to ensure you're protected.\nCustomer: I see. Thank you for your help.\nAgent: You're welcome, Alice. If you have any further questions, feel free to ask.", "summary": "A customer inquires about home insurance coverage for water damage in the basement, and the agent confirms that damage from burst pipes is covered but explains that flooding is typically not covered under standard policies. The agent advises the customer to consider purchasing a separate flood insurance policy for protection, resulting in the customer expressing gratitude for the assistance provided." 
}, { "conversation_id": 124, "customer_name": "Michael Johnson", "agent_name": "Sarah Brown", "policy_number": "DEF1234", "conversation": "Customer: Hi there, my name is Michael Johnson, Date of Birth is May 10th 1978, Address is 456 Oak St, Smalltown, CA 98765, and my Policy Number is QRS5678.\nAgent: Good afternoon, Michael. How can I help you today?\nCustomer: I'm having an issue with my home insurance policy.\nCustomer: There's been some damage to my roof due to a recent storm, and I'm not sure if it's covered.\nAgent: I'm sorry to hear about the damage. Let me check your policy to provide you with accurate information.\nAgent: According to your policy, damage caused by storms, including wind and hail damage to your roof, should be covered.\nCustomer: That's a relief to hear. What do I need to do next?\nAgent: You'll need to file a claim with your insurance company and provide documentation of the damage, such as photos or repair estimates.\nCustomer: Okay, I'll get started on that right away.\nAgent: If you need any assistance with the claims process, feel free to reach out to us for help.\nCustomer: Thank you for your assistance.\nAgent: You're welcome, Michael. Have a great day!", "summary": "A customer reports damage to their roof caused by a recent storm and seeks clarification on coverage under their home insurance policy. The agent confirms that such damage is typically covered, advises the customer to file a claim with the insurance company, and offers assistance with the claims process, resulting in the customer expressing gratitude for the assistance provided." }, { "conversation_id": 125, "customer_name": "Jennifer Brown", "agent_name": "David Wilson", "policy_number": "GHI7890", "conversation": "Customer: Hello, I'm Jennifer Brown, born on March 20th, 1980, residing at 789 Elm St, Suburbia, TX 54321, and my Policy Number is LMN9012.\nAgent: Good morning, Jennifer. 
How can I assist you today?\nCustomer: Hi, I have a question about my home insurance coverage.\nCustomer: A pipe burst in my kitchen, and there's water damage everywhere.\nAgent: I'm sorry to hear about the incident. Let me check your policy to see what's covered.\nAgent: Based on your policy, sudden and accidental water damage, including burst pipes, should be covered.\nCustomer: That's a relief. What should I do next?\nAgent: You'll need to file a claim with your insurance company and provide documentation of the damage.\nCustomer: Okay, I'll do that right away. Thank you for your help.\nAgent: You're welcome, Jennifer. If you have any further questions, feel free to reach out.", "summary": "A customer reports water damage in the kitchen due to a burst pipe and seeks clarification on coverage under their home insurance policy. The agent confirms that sudden and accidental water damage, including burst pipes, should be covered, advises the customer to file a claim with the insurance company, and offers further assistance, resulting in the customer expressing gratitude for the help provided." }, { "conversation_id": 126, "customer_name": "Robert Johnson", "agent_name": "Michelle Adams", "policy_number": "PQR3456", "conversation": "Customer: Hi, my name is Robert Johnson, DOB is July 5th, 1976, and I live at 456 Maple Ave, Cityville, OH 67890. My Policy Number is STU2345.\nAgent: Hello, Robert. How can I assist you today?\nCustomer: I have a concern about my home insurance policy.\nCustomer: My neighbor's tree fell on my fence during the storm, causing damage.\nAgent: I'm sorry to hear about the damage. Let me review your policy to see if it's covered.\nAgent: Unfortunately, damage caused by your neighbor's tree falling on your fence may not be covered under your policy.\nCustomer: That's disappointing. 
Is there anything I can do to get coverage?\nAgent: You may want to speak with your neighbor about their homeowner's insurance policy, as their coverage may apply to this situation.\nCustomer: I'll do that. Thank you for your assistance.\nAgent: You're welcome, Robert. If you have any further questions, don't hesitate to ask.", "summary": "A customer expresses concern about damage to their fence caused by a neighbor's tree falling during a storm and seeks clarification on coverage under their home insurance policy. The agent advises that such damage may not be covered under the customer's policy and suggests contacting the neighbor's homeowner's insurance for potential coverage, resulting in the customer expressing gratitude for the assistance provided." }, { "conversation_id": 127, "customer_name": "Emily Davis", "agent_name": "Daniel Miller", "policy_number": "UVW4567", "conversation": "Customer: Hi, I'm Emily Davis, born on September 12th, 1982, residing at 789 Pine St, Hilltown, FL 45678. My Policy Number is XYZ7890.\nAgent: Good afternoon, Emily. How can I assist you today?\nCustomer: Hello, I need to make a change to my home insurance policy.\nCustomer: I recently renovated my kitchen, and I need to update the coverage to reflect the changes.\nAgent: I can assist you with that. Let me update your policy with the new information.\nAgent: Your policy has been updated to reflect the renovation. Is there anything else I can help you with?\nCustomer: That's all for now. Thank you for your help.\nAgent: You're welcome, Emily. If you have any further questions or need assistance in the future, feel free to reach out.", "summary": "A customer requests a change to their home insurance policy to reflect recent renovations to their kitchen. The agent assists with updating the policy accordingly, and the customer expresses gratitude for the help provided." 
}, { "conversation_id": 128, "customer_name": "Jessica Wilson", "agent_name": "Ryan Thompson", "policy_number": "WXY6789", "conversation": "Customer: Hello, I'm Jessica Wilson, DOB is April 30th, 1974, and I live at 234 Oak St, Suburbia, CA 98765. My Policy Number is ABC1234.\nAgent: Good morning, Jessica. How can I assist you today?\nCustomer: Hi, I need to add an additional coverage to my home insurance policy.\nCustomer: I recently purchased some expensive jewelry, and I want to make sure it's covered in case of theft or loss.\nAgent: I can help you with that. Let me add a rider to your policy to cover the additional jewelry.\nAgent: Your policy has been updated to include coverage for your jewelry. Is there anything else I can assist you with?\nCustomer: That's all for now. Thank you for your help.\nAgent: You're welcome, Jessica. If you have any further questions or need assistance in the future, feel free to reach out.", "summary": "A customer requests to add additional coverage to their home insurance policy for recently purchased expensive jewelry to ensure protection against theft or loss. The agent assists by adding a rider to the policy for the additional coverage, and the customer expresses gratitude for the help provided." }, { "conversation_id": 129, "customer_name": "Andrew Brown", "agent_name": "Sophia Martinez", "policy_number": "JKL2345", "conversation": "Customer: Hi there, I'm Andrew Brown, born on November 25th, 1986, residing at 345 Cedar St, Smalltown, TX 67890. My Policy Number is DEF5678.\nAgent: Good afternoon, Andrew. How can I assist you today?\nCustomer: Hello, I need to update my contact information on my home insurance policy.\nCustomer: I recently moved, and I need to provide my new address and phone number.\nAgent: I can assist you with that. Let me update your contact information in our system.\nAgent: Your contact information has been updated. Is there anything else I can help you with?\nCustomer: That's all for now. 
Thank you for your help.\nAgent: You're welcome, Andrew. If you have any further questions or need assistance in the future, feel free to reach out.", "summary": "A customer requests to update their contact information on their home insurance policy due to a recent move. The agent assists by updating the customer's address and phone number in the system, and the customer expresses gratitude for the help provided." }, { "conversation_id": 130, "customer_name": "Michelle Evans", "agent_name": "Jacob Clark", "policy_number": "MNO7890", "conversation": "Customer: Hi, I'm Michelle Evans, DOB is June 15th, 1979, and I live at 567 Elm St, Cityville, NY 23456. My Policy Number is PQR9012.\nAgent: Good morning, Michelle. How can I assist you today?\nCustomer: Hello, I need to cancel my home insurance policy.\nCustomer: I'm selling my house, so I no longer need coverage.\nAgent: I can assist you with that. Let me process the cancellation for you.\nAgent: Your home insurance policy has been cancelled, effective immediately. Is there anything else I can help you with?\nCustomer: That's all, thank you for your help.\nAgent: You're welcome, Michelle. If you have any further questions or need assistance in the future, feel free to reach out.", "summary": "A customer requests to cancel their home insurance policy as they are selling their house and no longer require coverage. The agent assists by processing the cancellation, and the customer expresses gratitude for the help provided." }, { "conversation_id": 131, "customer_name": "David Garcia", "agent_name": "Emma Moore", "policy_number": "RST9012", "conversation": "Customer: Hi, I'm David Garcia, born on August 8th, 1988, residing at 789 Maple St, Suburbia, CA 34567. My Policy Number is UVW1234.\nAgent: Good morning, David. 
How can I assist you today?\nCustomer: Hello, I need to inquire about adding a home office coverage to my policy.\nCustomer: I recently started working from home and have valuable equipment that I want to protect.\nAgent: I understand. Let me check your policy to see what options are available.\nAgent: It appears that we offer a home business coverage option that may suit your needs.\nCustomer: That sounds perfect. Please add it to my policy.\nAgent: Your policy has been updated to include home business coverage. Is there anything else I can help you with?\nCustomer: That's all for now. Thank you for your assistance.\nAgent: You're welcome, David. If you have any further questions or need assistance in the future, feel free to reach out.", "summary": "A customer requests to add home office coverage to their policy as they recently started working from home and want to protect valuable equipment. The agent confirms the availability of a home business coverage option and assists by adding it to the policy, resulting in the customer expressing gratitude for the help provided." }, { "conversation_id": 132, "customer_name": "Sarah Hernandez", "agent_name": "John Lee", "policy_number": "LMN3456", "conversation": "Customer: Hi there, I'm Sarah Hernandez, born on January 12th, 1983, residing at 123 Cedar St, Hilltown, TX 12345. My Policy Number is GHI6789.\nAgent: Good afternoon, Sarah. How can I assist you today?\nCustomer: Hello, I recently got a pet dog and wanted to know if it affects my home insurance policy.\nCustomer: I heard that some breeds are considered high-risk and may affect coverage.\nAgent: Let me check your policy and see how pets are addressed.\nAgent: According to your policy, owning a dog may affect your liability coverage.\nCustomer: What do I need to do to ensure my coverage remains intact?\nAgent: You may need to disclose the breed and any history of aggression to your insurance company.\nCustomer: I'll do that. 
Thank you for your help.\nAgent: You're welcome, Sarah. If you have any further questions or need assistance in the future, feel free to reach out.", "summary": "A customer inquires about the impact of getting a pet dog on their home insurance policy, concerned about potential breed-related issues. The agent checks the policy and explains that owning a dog may affect liability coverage, advising the customer to disclose breed information and any history of aggression to the insurance company to ensure coverage remains intact, resulting in the customer expressing gratitude for the assistance provided." }, { "conversation_id": 133, "customer_name": "Christopher Martinez", "agent_name": "Olivia Taylor", "policy_number": "OPQ4567", "conversation": "Customer: Hi, I'm Christopher Martinez, DOB is April 5th, 1980, and I live at 456 Walnut St, Smalltown, NY 89012. My Policy Number is JKL7890.\nAgent: Good morning, Christopher. How can I assist you today?\nCustomer: Hello, I need to renew my home insurance policy.\nCustomer: My policy is expiring soon, and I want to ensure continuous coverage.\nAgent: Let me check your policy renewal options and provide you with the necessary information.\nAgent: Your policy renewal options have been reviewed, and I can assist you with the renewal process.\nCustomer: That's great. Please proceed with the renewal.\nAgent: Your policy has been successfully renewed. Is there anything else I can help you with?\nCustomer: That's all for now. Thank you for your assistance.\nAgent: You're welcome, Christopher. If you have any further questions or need assistance in the future, feel free to reach out.", "summary": "A customer requests to renew their home insurance policy as it is expiring soon, seeking continuous coverage. The agent reviews renewal options, assists with the renewal process, and confirms successful renewal, resulting in the customer expressing gratitude for the assistance provided." 
}, { "conversation_id": 134, "customer_name": "Amy Thompson", "agent_name": "William Davis", "policy_number": "CDE7890", "conversation": "Customer: Hi, I'm Amy Thompson, born on October 18th, 1984, residing at 789 Birch St, Suburbia, CA 23456. My Policy Number is EFG1234.\nAgent: Good afternoon, Amy. How can I assist you today?\nCustomer: Hello, I need to report a claim for damage to my home.\nCustomer: There was a fire in my kitchen, and there's significant damage.\nAgent: I'm sorry to hear about the fire. Let me assist you with filing a claim.\nAgent: Your claim has been initiated, and an adjuster will contact you shortly for further assistance.\nCustomer: Thank you for your help.\nAgent: You're welcome, Amy. If you have any further questions or need assistance in the future, feel free to reach out.", "summary": "A customer reports a claim for damage to their home due to a fire in the kitchen, seeking assistance with the claims process. The agent initiates the claim and assures the customer that an adjuster will contact them shortly for further assistance, resulting in the customer expressing gratitude for the help provided." }, { "conversation_id": 135, "customer_name": "Linda Wilson", "agent_name": "Michael Brown", "policy_number": "FGH9012", "conversation": "Customer: Hi, I'm Linda Wilson, born on June 25th, 1975, residing at 234 Pine St, Cityville, TX 56789. My Policy Number is IJK2345.\nAgent: Good morning, Linda. How can I assist you today?\nCustomer: Hello, I'm extremely disappointed with the service I've received from your company.\nCustomer: I filed a claim for water damage a month ago, and I still haven't received any updates.\nAgent: I apologize for the delay in processing your claim, Linda. Let me investigate the status for you.\nAgent: It appears that there was an oversight in processing your claim. I will expedite the review process and provide you with an update shortly.\nCustomer: This is unacceptable. 
I expect better service from my insurance provider.\nAgent: I completely understand your frustration, Linda. Rest assured, I will do everything in my power to resolve this matter promptly.\nCustomer: I hope so. I've been a loyal customer for years, and this experience has been disappointing.\nAgent: I sincerely apologize for the inconvenience, Linda. I'll keep you updated on the progress of your claim.\nCustomer: Thank you.", "summary": "A customer expresses extreme disappointment with the service received from the company, citing a delay in processing a claim for water damage filed a month ago. The agent acknowledges the oversight, apologizes for the inconvenience, and assures the customer of expedited review and updates on the claim's progress, with the customer expressing hope for a resolution and gratitude for the attention to the matter." }, { "conversation_id": 136, "customer_name": "Brian Adams", "agent_name": "Jessica Miller", "policy_number": "KLM3456", "conversation": "Customer: Hi, I'm Brian Adams, DOB is December 10th, 1982, and I live at 345 Oak St, Hilltown, CA 78901. My Policy Number is NOP4567.\nAgent: Good afternoon, Brian. How can I assist you today?\nCustomer: Hello, I'm beyond frustrated with your company's billing practices.\nCustomer: I received a notice stating that my premium has increased significantly without any explanation.\nAgent: I apologize for the inconvenience, Brian. Let me review your policy to understand the reason for the increase.\nAgent: It appears that there was an error in the calculation of your premium. I will escalate this issue to our billing department and ensure it's rectified immediately.\nCustomer: This is unacceptable. I expect transparency and fairness from my insurance provider.\nAgent: I completely understand your frustration, Brian. 
Rest assured, I will personally oversee the resolution of this matter and keep you updated on the progress.\nCustomer: I appreciate your assistance, but this shouldn't have happened in the first place.\nAgent: I apologize once again, Brian. I'll ensure that corrective measures are put in place to prevent similar issues in the future.\nCustomer: I hope so.", "summary": "A customer expresses frustration with the company's billing practices, citing a significant increase in premiums without explanation. The agent apologizes for the inconvenience, acknowledges the error in premium calculation, and assures the customer of immediate escalation and resolution, with the customer emphasizing the expectation of transparency and fairness from their insurance provider and the agent expressing commitment to preventive measures to avoid similar issues in the future." }, { "conversation_id": 137, "customer_name": "Karen Garcia", "agent_name": "Richard Martinez", "policy_number": "QRS5678", "conversation": "Customer: Hi, I'm Karen Garcia, born on September 5th, 1979, residing at 456 Cedar St, Smalltown, NY 34567. My Policy Number is TUV6789.\nAgent: Good morning, Karen. How can I assist you today?\nCustomer: Hello, I'm extremely dissatisfied with your company's claims handling process.\nCustomer: I filed a claim for roof damage three weeks ago, and there's been no progress or communication since then.\nAgent: I apologize for the lack of updates, Karen. Let me investigate the status of your claim and provide you with an update.\nAgent: It appears that there was a delay in processing your claim due to a backlog. I will expedite the review process and ensure you receive a timely resolution.\nCustomer: This is unacceptable. I've been left in the dark for too long, and it's causing me a lot of stress.\nAgent: I understand your frustration, Karen. 
Rest assured, I will personally oversee the handling of your claim and keep you informed every step of the way.\nCustomer: I expect better from my insurance provider. This level of service is unacceptable.\nAgent: I apologize for the inconvenience, Karen. I'll do everything in my power to address your concerns and ensure a satisfactory outcome.\nCustomer: I hope so.", "summary": "A customer expresses extreme dissatisfaction with the company's claims handling process, citing a lack of progress and communication regarding a filed claim for roof damage. The agent apologizes for the inconvenience, acknowledges the delay due to a backlog, and assures the customer of expedited review and personal oversight to ensure timely resolution, with the customer emphasizing the expectation of better service and the agent expressing commitment to addressing concerns and achieving a satisfactory outcome." }, { "conversation_id": 138, "customer_name": "Jason Miller", "agent_name": "Michelle Harris", "policy_number": "VWX7890", "conversation": "Customer: Hi, I'm Jason Miller, DOB is November 15th, 1983, and I live at 567 Elm St, Suburbia, CA 45678. My Policy Number is YZA8901.\nAgent: Good afternoon, Jason. How can I assist you today?\nCustomer: Hello, I'm furious with your company's lack of responsiveness.\nCustomer: I've been trying to contact your claims department for days, but I keep getting transferred and put on hold.\nAgent: I apologize for the inconvenience, Jason. Let me escalate your issue to a supervisor for immediate assistance.\nAgent: A supervisor will contact you shortly to address your concerns and ensure a prompt resolution.\nCustomer: This is unacceptable. I expect better customer service from my insurance provider.\nAgent: I completely understand your frustration, Jason. Rest assured, we will do everything in our power to rectify the situation and regain your trust.\nCustomer: I hope so. 
This experience has been extremely frustrating and disappointing.\nAgent: I sincerely apologize for the inconvenience, Jason. We value your feedback, and we're committed to improving our service standards.\nCustomer: I appreciate that.", "summary": "A customer expresses fury over the company's lack of responsiveness, stating difficulties in contacting the claims department despite attempts over several days. The agent apologizes, escalates the issue to a supervisor for immediate assistance, and assures the customer of efforts to rectify the situation and regain trust, with the customer emphasizing the expectation of better customer service and the agent expressing commitment to improvement and appreciation for the feedback." }, { "conversation_id": 139, "customer_name": "Rachel Clark", "agent_name": "Daniel Wilson", "policy_number": "BCD1234", "conversation": "Customer: Hi, I'm Rachel Clark, born on February 20th, 1981, residing at 678 Walnut St, Cityville, TX 89012. My Policy Number is EFG2345.\nAgent: Good morning, Rachel. How can I assist you today?\nCustomer: Hello, I'm extremely disappointed with your company's claims denial decision.\nCustomer: I filed a claim for water damage, and it was denied without any explanation.\nAgent: I apologize for the frustration, Rachel. Let me review the details of your claim and the reason for the denial.\nAgent: It appears that the damage was deemed to be the result of gradual wear and tear, which is not covered under your policy.\nCustomer: This is unacceptable. I've been paying premiums for years, expecting coverage when I need it most.\nAgent: I understand your frustration, Rachel. I'll escalate your concerns to our claims department for further review.\nCustomer: I expect a thorough review of my claim and a fair decision. This denial has caused me a lot of stress.\nAgent: I'll ensure that your claim is reevaluated promptly, Rachel. 
I apologize for any inconvenience this has caused.\nCustomer: Thank you.", "summary": "A customer expresses extreme disappointment with the company's claims denial decision regarding water damage, citing lack of explanation. The agent apologizes, reviews the claim details, and explains that the denial was due to damage deemed gradual wear and tear, not covered under the policy. The customer emphasizes the expectation of coverage after years of premium payments, and the agent escalates the concerns for further review, promising a thorough reevaluation and apologizing for any inconvenience caused." }, { "conversation_id": 140, "customer_name": "Emily Rodriguez", "agent_name": "David Garcia", "policy_number": "LMN5678", "conversation": "Customer: Hi, I'm Emily Rodriguez, born on April 8th, 1986, residing at 789 Birch St, Hilltown, CA 23456. My Policy Number is OPQ6789.\nAgent: Good morning, Emily. How can I assist you today?\nCustomer: Hello, I'm extremely frustrated with your company's decision to deny my claim.\nCustomer: I filed a claim for damage caused by a fallen tree, and it was denied without any explanation.\nAgent: I understand your frustration, Emily. Let me review the details of your claim and provide you with an explanation.\nAgent: It appears that the damage was deemed to be the result of an excluded peril, which is not covered under your policy.\nCustomer: This is unacceptable. I've been paying premiums for years, expecting coverage when I need it most.\nAgent: I apologize for the inconvenience, Emily. I'll escalate your concerns to our claims department for further review.\nCustomer: I expect a thorough review of my claim and a fair decision. This denial has caused me a lot of stress.\nAgent: I'll ensure that your claim is reevaluated promptly, Emily. I apologize for any inconvenience this has caused.\nCustomer: Thank you.", "summary": "Customer expresses frustration with claim denial for tree damage, demands explanation. 
Agent apologizes, cites damage as excluded peril, promises review. Customer stresses expectation of coverage, agent escalates concerns for thorough reevaluation, apologizes for inconvenience." }, { "conversation_id": 141, "customer_name": "Matthew Lopez", "agent_name": "Emma Wilson", "policy_number": "RST7890", "conversation": "Customer: Hi, I'm Matthew Lopez, DOB is October 12th, 1984, and I live at 456 Cedar St, Smalltown, NY 34567. My Policy Number is TUV8901.\nAgent: Good afternoon, Matthew. How can I assist you today?\nCustomer: Hello, I'm extremely disappointed with your company's decision to deny my claim.\nCustomer: I filed a claim for water damage, and it was denied without any explanation.\nAgent: I understand your frustration, Matthew. Let me review the details of your claim and provide you with an explanation.\nAgent: It appears that the damage was deemed to be the result of a maintenance issue, which is not covered under your policy.\nCustomer: This is unacceptable. I've been paying premiums for years, expecting coverage when I need it most.\nAgent: I apologize for the inconvenience, Matthew. I'll escalate your concerns to our claims department for further review.\nCustomer: I expect a thorough review of my claim and a fair decision. This denial has caused me a lot of stress.\nAgent: I'll ensure that your claim is reevaluated promptly, Matthew. I apologize for any inconvenience this has caused.\nCustomer: Thank you.", "summary": "Customer expresses disappointment with claim denial for water damage, demands explanation. Agent apologizes, cites damage as maintenance issue, promises review. Customer stresses expectation of coverage, agent escalates concerns for thorough reevaluation, apologizes for inconvenience." 
}, { "conversation_id": 142, "customer_name": "Amanda Thompson", "agent_name": "Michael Johnson", "policy_number": "UVW9012", "conversation": "Customer: Hi, I'm Amanda Thompson, born on March 15th, 1983, residing at 567 Oak St, Suburbia, CA 67890. My Policy Number is XYZ0123.\nAgent: Good morning, Amanda. How can I assist you today?\nCustomer: Hello, I'm extremely frustrated with your company's decision to deny my claim.\nCustomer: I filed a claim for theft of personal belongings, and it was denied without any explanation.\nAgent: I understand your frustration, Amanda. Let me review the details of your claim and provide you with an explanation.\nAgent: It appears that the theft was deemed to be the result of negligence, which is not covered under your policy.\nCustomer: This is unacceptable. I've been paying premiums for years, expecting coverage when I need it most.\nAgent: I apologize for the inconvenience, Amanda. I'll escalate your concerns to our claims department for further review.\nCustomer: I expect a thorough review of my claim and a fair decision. This denial has caused me a lot of stress.\nAgent: I'll ensure that your claim is reevaluated promptly, Amanda. I apologize for any inconvenience this has caused.\nCustomer: Thank you.", "summary": "Customer frustrated by claim denial for theft of personal belongings, seeks explanation. Agent apologizes, attributes theft to negligence, promises review. Customer emphasizes expectation of coverage, agent escalates concerns for thorough reevaluation, apologizes for inconvenience." }, { "conversation_id": 143, "customer_name": "Jennifer Lee", "agent_name": "Olivia Brown", "policy_number": "CDE2345", "conversation": "Customer: Hi, I'm Jennifer Lee, born on August 20th, 1980, residing at 678 Pine St, Cityville, TX 45678. My Policy Number is EFG3456.\nAgent: Good afternoon, Jennifer. 
How can I assist you today?\nCustomer: Hello, I'm extremely disappointed with your company's decision to deny my claim.\nCustomer: I filed a claim for fire damage, and it was denied without any explanation.\nAgent: I understand your frustration, Jennifer. Let me review the details of your claim and provide you with an explanation.\nAgent: It appears that the fire was deemed to be the result of arson, which is not covered under your policy.\nCustomer: This is unacceptable. I've been paying premiums for years, expecting coverage when I need it most.\nAgent: I apologize for the inconvenience, Jennifer. I'll escalate your concerns to our claims department for further review.\nCustomer: I expect a thorough review of my claim and a fair decision. This denial has caused me a lot of stress.\nAgent: I'll ensure that your claim is reevaluated promptly, Jennifer. I apologize for any inconvenience this has caused.\nCustomer: Thank you.", "summary": "Customer frustrated by claim denial for fire damage, seeks explanation. Agent attributes fire to arson, not covered under policy. Customer emphasizes expectation of coverage, agent escalates concerns for thorough reevaluation, apologizes for inconvenience." }, { "conversation_id": 140, "customer_name": "Emily White", "agent_name": "Andrew Thompson", "policy_number": "EFG2345", "conversation": "Customer: Hi, I'm Emily White, born on July 10th, 1980, residing at 789 Pine St, Hilltown, CA 56789. My Policy Number is HIJ3456.\nAgent: Good morning, Emily. How can I assist you today?\nCustomer: Hello, I'm extremely disappointed with your company's decision to deny my claim.\nCustomer: I filed a claim for water damage, but it was denied due to 'lack of timely notification.'\nAgent: I apologize for the inconvenience, Emily. Let me review the details of your claim denial.\nAgent: It appears that the damage occurred several weeks ago, and our policy requires claims to be reported within 72 hours.\nCustomer: This is ridiculous. 
I wasn't aware of the damage until recently, and I promptly filed the claim.\nAgent: I understand your frustration, Emily. I'll escalate your concerns to our claims department for further review.\nCustomer: I expect a fair evaluation of my claim. This denial has caused me a lot of stress and financial burden.\nAgent: I'll ensure that your claim is reevaluated promptly, Emily. I apologize for any inconvenience this has caused.\nCustomer: Thank you.", "summary": "Customer Emily White disappointed by claim denial for water damage due to 'lack of timely notification'. Agent attributes denial to damage reported beyond policy's 72-hour limit. Customer expresses frustration and financial burden. Agent apologizes and promises prompt reevaluation of the claim." }, { "conversation_id": 141, "customer_name": "James Rodriguez", "agent_name": "Sophia Martinez", "policy_number": "KLM4567", "conversation": "Customer: Hi, I'm James Rodriguez, DOB is March 15th, 1977, and I live at 456 Cedar St, Smalltown, TX 67890. My Policy Number is NOP5678.\nAgent: Good afternoon, James. How can I assist you today?\nCustomer: Hello, I'm extremely frustrated with your company's decision to deny my claim.\nCustomer: I filed a claim for hail damage to my roof, but it was denied due to 'pre-existing damage.'\nAgent: I apologize for the inconvenience, James. Let me review the details of your claim denial.\nAgent: It appears that there was evidence of prior damage to your roof, which was not covered under your policy.\nCustomer: This is outrageous. I had no knowledge of any pre-existing damage, and I've been paying premiums for years.\nAgent: I understand your frustration, James. I'll escalate your concerns to our claims department for further review.\nCustomer: I expect a thorough investigation of my claim and a fair decision. This denial has caused me significant financial hardship.\nAgent: I'll ensure that your claim is reevaluated promptly, James. 
I apologize for any inconvenience this has caused.\nCustomer: Thank you.", "summary": "Customer James Rodriguez frustrated by claim denial for hail damage due to 'pre-existing damage'. Agent attributes denial to evidence of prior damage not covered by policy. Customer expresses outrage and financial hardship. Agent promises prompt reevaluation of the claim." }, { "conversation_id": 141, "customer_name": "James Rodriguez", "agent_name": "Sophia Martinez", "policy_number": "KLM4567", "conversation": "Customer: Hi, I'm James Rodriguez, DOB is March 15th, 1977, and I live at 456 Cedar St, Smalltown, TX 67890. My Policy Number is NOP5678.\nAgent: Good afternoon, James. How can I assist you today?\nCustomer: Hello, I'm extremely frustrated with your company's decision to deny my claim.\nCustomer: I filed a claim for hail damage to my roof, but it was denied due to 'pre-existing damage.'\nAgent: I apologize for the inconvenience, James. Let me review the details of your claim denial.\nAgent: It appears that there was evidence of prior damage to your roof, which was not covered under your policy.\nCustomer: This is outrageous. I had no knowledge of any pre-existing damage, and I've been paying premiums for years.\nAgent: I understand your frustration, James. I'll escalate your concerns to our claims department for further review.\nCustomer: I expect a thorough investigation of my claim and a fair decision. This denial has caused me significant financial hardship.\nAgent: I'll ensure that your claim is reevaluated promptly, James. I apologize for any inconvenience this has caused.\nCustomer: Thank you.", "summary": " Customer disputes claim denial for hail damage, citing lack of awareness of pre-existing damage. Agent apologizes, attributing denial to evidence of prior damage not covered by the policy. Customer insists on thorough review and fair decision. Agent promises escalation for reevaluation." 
}, { "conversation_id": 142, "customer_name": "Melissa Thompson", "agent_name": "David Wilson", "policy_number": "PQR5678", "conversation": "Customer: Hi, I'm Melissa Thompson, born on December 5th, 1979, residing at 678 Elm St, Suburbia, NY 90123. My Policy Number is STU6789.\nAgent: Good morning, Melissa. How can I assist you today?\nCustomer: Hello, I'm extremely disappointed with your company's decision to deny my claim.\nCustomer: I filed a claim for fire damage to my garage, but it was denied due to 'policy exclusions.'\nAgent: I apologize for the inconvenience, Melissa. Let me review the details of your claim denial.\nAgent: It appears that damage caused by arson is specifically excluded from coverage under your policy.\nCustomer: This is infuriating. The fire was accidental, and I had nothing to do with it.\nAgent: I understand your frustration, Melissa. I'll escalate your concerns to our claims department for further review.\nCustomer: I expect a fair evaluation of my claim. This denial has caused me a lot of stress and financial hardship.\nAgent: I'll ensure that your claim is reevaluated promptly, Melissa. I apologize for any inconvenience this has caused.\nCustomer: Thank you.", "summary": "Customer disputes claim denial for fire damage, claiming it was accidental. Agent apologizes and explains policy exclusion for damage caused by arson. Customer insists on fair evaluation and expresses stress and financial hardship. Agent promises prompt reevaluation of the claim." }, { "conversation_id": 143, "customer_name": "Steven Lee", "agent_name": "Emma Moore", "policy_number": "UVW6789", "conversation": "Customer: Hi, I'm Steven Lee, DOB is August 20th, 1985, and I live at 789 Oak St, Cityville, CA 23456. My Policy Number is XYZ7890.\nAgent: Good afternoon, Steven. 
How can I assist you today?\nCustomer: Hello, I'm extremely frustrated with your company's decision to deny my claim.\nCustomer: I filed a claim for theft of personal belongings, but it was denied due to 'lack of evidence.'\nAgent: I apologize for the inconvenience, Steven. Let me review the details of your claim denial.\nAgent: It appears that there was insufficient evidence to support the claim of theft.\nCustomer: This is unacceptable. My belongings were stolen, and I provided all the necessary documentation.\nAgent: I understand your frustration, Steven. I'll escalate your concerns to our claims department for further review.\nCustomer: I expect a thorough investigation of my claim and a fair decision. This denial has caused me significant financial loss.\nAgent: I'll ensure that your claim is reevaluated promptly, Steven. I apologize for any inconvenience this has caused.\nCustomer: Thank you.", "summary": "Customer disputes claim denial for theft of personal belongings due to lack of evidence. Agent apologizes and explains insufficient evidence for the claim. Customer insists on fair investigation and expresses financial loss. Agent promises prompt reevaluation of the claim." }, { "conversation_id": 144, "customer_name": "Nicole Brown", "agent_name": "John Davis", "policy_number": "LMN6789", "conversation": "Customer: Hi, I'm Nicole Brown, born on May 30th, 1983, residing at 123 Maple St, Suburbia, TX 45678. My Policy Number is ABC2345.\nAgent: Good morning, Nicole. How can I assist you today?\nCustomer: Hello, I'm extremely disappointed with your company's decision to deny my claim.\nCustomer: I filed a claim for storm damage to my fence, but it was denied due to 'acts of nature exclusion.'\nAgent: I apologize for the inconvenience, Nicole. Let me review the details of your claim denial.\nAgent: It appears that damage caused by storms, including wind and hail, is specifically excluded from coverage under your policy.\nCustomer: This is frustrating. 
I thought I was protected against such events.\nAgent: I understand your frustration, Nicole. I'll escalate your concerns to our claims department for further review.\nCustomer: I expect a fair evaluation of my claim. This denial has caused me a lot of stress and financial burden.\nAgent: I'll ensure that your claim is reevaluated promptly, Nicole. I apologize for any inconvenience this has caused.\nCustomer: Thank you.", "summary": "Customer's claim for storm damage to her fence is denied due to \"acts of nature exclusion.\" Agent apologizes and explains the policy's exclusion. Customer expresses frustration and financial burden. Agent promises a prompt reevaluation of the claim." } ] ================================================ FILE: Code Examples/GenAI-RAG/cvpipeline.py ================================================ # 2024-11-25 # Andreas Kretz # This code currently doesn't work because the preparation of the text for ElasticSearch doesn't work # Try to fix this and write the data import json, os # Importing JSON for handling JSON data and os for interacting with the operating system import fitz # PyMuPDF from llama_index.core import Document, Settings # Importing Document class and Settings for managing LlamaIndex from llama_index.core.node_parser import SentenceSplitter # Importing SentenceSplitter to split text into smaller chunks from llama_index.core.ingestion import IngestionPipeline # Importing IngestionPipeline for managing data ingestion from llama_index.embeddings.ollama import OllamaEmbedding # Importing OllamaEmbedding for generating text embeddings from llama_index.vector_stores.elasticsearch import ElasticsearchStore # Importing ElasticsearchStore for vector storage from dotenv import load_dotenv # Importing load_dotenv to load environment variables from a .env file from llama_index.core import VectorStoreIndex, QueryBundle, Response, Settings from llama_index.embeddings.ollama import OllamaEmbedding from llama_index.llms.ollama import Ollama 
from index_raw import es_vector_store from ollama import chat from ollama import ChatResponse # extract text form the pdf with PyMuPDF def extract_text_from_pdf(path): doc = fitz.open(path) text = "" for page_num in range(len(doc)): page = doc.load_page(page_num) page_text = page.get_text() text += page_text print(text) return text # feed the pdf into mistral and get a JSON back # this fails currently because I cannot get a good answer from mistral. the problem is with escaping \n and '. def prepare_text_to_json(text_to_summarize): instruction_template = "Here's a text. Encapsulate it into a json as a string and don't turn it into json attributes. Keep it flat. The attribute where the text should go into is called text. Create another attribute of the json called name and put the name of the person there:" response: ChatResponse = chat(model='mistral', messages=[ { 'role': 'user', 'content': instruction_template + text_to_summarize, }, ]) print(".....Prepared this json.....\n") print(response['message']['content']) return response['message']['content'] # Define an Elasticsearch vector store with configuration for local Elasticsearch es_vector_store = ElasticsearchStore( index_name="student_cvs", # Name of the Elasticsearch index vector_field='conversation_vector', # Field to store the vector representation of the text text_field='conversation', # Field to store the original text es_url="http://localhost:9200" # URL of the local Elasticsearch instance ) local_llm = Ollama(model="mistral") def main(): ollama_embedding = OllamaEmbedding("mistral") # Initialize the embedding model for generating embeddings using the "mistral" model # Set up an ingestion pipeline with transformations and the Elasticsearch vector store pipeline = IngestionPipeline( transformations=[ SentenceSplitter(chunk_size=350, chunk_overlap=50), # Split text into chunks of size 350 with 50 characters of overlap ollama_embedding, # Use the embedding model to generate embeddings for the chunks ], 
vector_store=es_vector_store # Use the configured Elasticsearch vector store ) extracted = extract_text_from_pdf('Liam_McGivney_CV.pdf') #extract the text from the CV prepped_json = prepare_text_to_json(extracted) # prepare the json #create a document (I think this is wrong right now) documents = Document(text=prepped_json['text'], metadata={"name": prepped_json['name']}) #documents = [Document(text=item['text']) for entry in prepped_json] #documents = [Document(text=item['text'], metadata={"name": item['name']}) for item in prepped_json] pipeline.run(documents=documents) # Run the pipeline to process documents and store embeddings in Elasticsearch print(".....Done running pipeline.....\n") # Print a completion message # Entry point of the script if __name__ == "__main__": main() # Call the main function ================================================ FILE: Code Examples/GenAI-RAG/docker-compose.yml ================================================ services: # Elasticsearch Docker Images: https://www.docker.elastic.co/ elasticsearch: image: docker.elastic.co/elasticsearch/elasticsearch:8.16.0 container_name: elasticsearch environment: - xpack.security.enabled=false - discovery.type=single-node ulimits: memlock: soft: -1 hard: -1 nofile: soft: 65536 hard: 65536 cap_add: - IPC_LOCK volumes: - elasticsearch-data17:/usr/share/elasticsearch/data ports: - 9200:9200 - 9300:9300 kibana: container_name: kibana image: docker.elastic.co/kibana/kibana:8.16.0 environment: - ELASTICSEARCH_HOSTS=http://elasticsearch:9200 ports: - 5601:5601 depends_on: - elasticsearch volumes: elasticsearch-data17: driver: local ================================================ FILE: Code Examples/GenAI-RAG/index.py ================================================ import json, os # Importing JSON for handling JSON data and os for interacting with the operating system from llama_index.core import Document, Settings # Importing Document class and Settings for managing LlamaIndex from 
llama_index.core.node_parser import SentenceSplitter # Importing SentenceSplitter to split text into smaller chunks from llama_index.core.ingestion import IngestionPipeline # Importing IngestionPipeline for managing data ingestion from llama_index.embeddings.ollama import OllamaEmbedding # Importing OllamaEmbedding for generating text embeddings from llama_index.vector_stores.elasticsearch import ElasticsearchStore # Importing ElasticsearchStore for vector storage from dotenv import load_dotenv # Importing load_dotenv to load environment variables from a .env file def get_documents_from_file(file): """Reads a JSON file and returns a list of Document objects""" # Open the JSON file in read-text mode with open(file=file, mode='rt') as f: conversations_dict = json.loads(f.read()) # Load the file contents into a Python dictionary # Create a list of Document objects using the 'conversation' field as text # and 'conversation_id' field as metadata documents = [Document(text=item['conversation'], metadata={"conversation_id": item['conversation_id']}) for item in conversations_dict] return documents # Return the list of Document objects # Define an Elasticsearch vector store with configuration for local Elasticsearch es_vector_store = ElasticsearchStore( index_name="calls", # Name of the Elasticsearch index vector_field='conversation_vector', # Field to store the vector representation of the text text_field='conversation', # Field to store the original text es_url="http://localhost:9200" # URL of the local Elasticsearch instance ) # Uncomment this if using Elastic Cloud and ensure ELASTIC_CLOUD_ID and ELASTIC_API_KEY are set in the .env file # Load the .env file contents into environment variables # This is used to access sensitive information like API keys or credentials # load_dotenv('.env') # es_vector_store = ElasticsearchStore( # index_name="calls", # Name of the Elasticsearch index # vector_field='conversation_vector', # Field for vector embeddings # 
text_field='conversation', # Field for storing original text # es_cloud_id=os.getenv("ELASTIC_CLOUD_ID"), # Cloud ID from the .env file # es_api_key=os.getenv("ELASTIC_API_KEY") # API key from the .env file # ) def main(): ollama_embedding = OllamaEmbedding("mistral") # Initialize the embedding model for generating embeddings using the "mistral" model # Set up an ingestion pipeline with transformations and the Elasticsearch vector store pipeline = IngestionPipeline( transformations=[ SentenceSplitter(chunk_size=350, chunk_overlap=50), # Split text into chunks of size 350 with 50 characters of overlap ollama_embedding, # Use the embedding model to generate embeddings for the chunks ], vector_store=es_vector_store # Use the configured Elasticsearch vector store ) documents = get_documents_from_file(file="conversations.json") # Load data from a JSON file and convert it to a list of Document objects pipeline.run(documents=documents) # Run the pipeline to process documents and store embeddings in Elasticsearch print(".....Done running pipeline.....\n") # Print a completion message # Entry point of the script if __name__ == "__main__": main() # Call the main function ================================================ FILE: Code Examples/GenAI-RAG/query.py ================================================ # query.py from llama_index.core import VectorStoreIndex, QueryBundle, Response, Settings from llama_index.embeddings.ollama import OllamaEmbedding from llama_index.llms.ollama import Ollama from index_raw import es_vector_store # Local LLM to send user query to local_llm = Ollama(model="mistral") # Initialize a local language model (LLM) using the "mistral" model from Ollama Settings.embed_model= OllamaEmbedding("mistral") # Create a VectorStoreIndex from the existing Elasticsearch vector store index = VectorStoreIndex.from_vector_store(es_vector_store) # Create a VectorStoreIndex from the existing Elasticsearch vector store query_engine = index.as_query_engine(local_llm, 
similarity_top_k=10) # Create a query engine from the index using the local LLM and set top-k similarity results to 10 # Define the query string for the question you want to ask the system you'll see that it has some problems understanding the context # Especially how to find the policy number from the person's name. #query="Give me summary of water related issues" #query="What policy number does emily green, born April 10th, 1988 have?" #query="Who has the policy number DEF4567" #query="What information about the person do you need to determin the policy number?" query="What policy number does emily green, living in 101 Pine St, Boston, MA 02101 have?" # Create a QueryBundle object, which packages the query and its embedding # The embedding is generated using the configured embedding model in Settings bundle = QueryBundle(query, embedding=Settings.embed_model.get_query_embedding(query)) # Use the query engine to execute the query bundle against the vector store # and retrieve the most relevant results result = query_engine.query(bundle) # Print the results of the query to the console print(result) ================================================ FILE: Code Examples/Movies.txt ================================================ Year;Length;Title;Subject;Actor;Actress;Director;Popularity;Awards;*Image INT;INT;STRING;CAT;CAT;CAT;CAT;INT;BOOL;STRING 1990;111;Tie Me Up! 
Tie Me Down!;Comedy;Banderas, Antonio;Abril, Victoria;Almodóvar, Pedro;68;No;NicholasCage.png 1991;113;High Heels;Comedy;Bosé, Miguel;Abril, Victoria;Almodóvar, Pedro;68;No;NicholasCage.png 1983;104;Dead Zone, The;Horror;Walken, Christopher;Adams, Brooke;Cronenberg, David;79;No;NicholasCage.png 1979;122;Cuba;Action;Connery, Sean;Adams, Brooke;Lester, Richard;6;No;seanConnery.png 1978;94;Days of Heaven;Drama;Gere, Richard;Adams, Brooke;Malick, Terrence;14;No;NicholasCage.png 1983;140;Octopussy;Action;Moore, Roger;Adams, Maud;Glen, John;68;No;NicholasCage.png 1984;101;Target Eagle;Action;Connors, Chuck;Adams, Maud;Loma, José Antonio de la;14;No;NicholasCage.png 1989;99;American Angels: Baptism of Blood, The;Drama;Bergen, Robert D.;Adams, Trudy;Sebastian, Beverly;28;No;NicholasCage.png 1985;104;Subway;Drama;Lambert, Christopher;Adjani, Isabelle;Besson, Luc;6;No;NicholasCage.png 1990;149;Camille Claudel;Drama;Depardieu, Gérard;Adjani, Isabelle;Nuytten, Bruno;32;No;NicholasCage.png 1982;188;Fanny and Alexander;Drama;Ahlstedt, Börje;Adolphson, Kristina;Bergman, Ingmar;81;Yes;Bergman.png 1982;117;Tragedy of a Ridiculous Man;Drama;Tognazzi, Ugo;Aimee, Anouk;Bertolucci, Bernardo;17;No;NicholasCage.png 1966;103;A Man & a Woman;Drama;Trintignant, Jean-Louis;Aimee, Anouk;Lelouch, Claude;46;Yes;NicholasCage.png 1986;112;A Man & a Woman: Twenty Years Later;Drama;Trintignant, Jean-Louis;Aimee, Anouk;Lelouch, Claude;49;No;NicholasCage.png 1966;103;Un Hombre y una Mujer;Drama;Trintignant, Jean-Louis;Aimee, Anouk;Lelouch, Claude;6;Yes;NicholasCage.png 1985;112;Official Story, The;Drama;Alterio, Hector;Aleandro, Norma;Puenzo, Luiz;39;Yes;NicholasCage.png 1976;150;Lindbergh Kidnapping Case, The;Drama;Hopkins, Anthony;Alexander, Denise;Kulik, Buzz;51;No;AnthonyHopkins.png 1929;84;Blackmail;Mystery;Longden, John;Algood, Sara;Hitchcock, Alfred;2;No;alfredHitchcock.png 1963;109;Donovan's Reef;Comedy;Wayne, John;Allen, Elizabeth;Ford, John;62;No;johnWayne.png 1988;110;Tucker: The Man & His 
Dream;Drama;Bridges, Jeff;Allen, Joan;Coppola, Francis Ford;68;No;NicholasCage.png 1988;101;Scrooged;Comedy;Murray, Bill;Allen, Karen;Donner, Richard;15;No;NicholasCage.png 1981;116;Raiders of the Lost Ark;Action;Ford, Harrison;Allen, Karen;Spielberg, Steven;8;No;NicholasCage.png 1987;101;Running Man, The;Science Fiction;Schwarzenegger, Arnold;Alonso, Maria Conchita;Glaser, Paul Michael;31;No;NicholasCage.png 1991;105;Predator 2;Action;Glover, Danny;Alonso, Maria Conchita;Hopkins, Stephen;79;No;NicholasCage.png 1988;127;Colors;Drama;Penn, Sean;Alonso, Maria Conchita;Hopper, Dennis;23;No;NicholasCage.png 1990;97;Zandalee;Drama;Cage, Nicolas;Anderson, Erika;Pillsbury, Sam;80;No;NicholasCage.png 1988;108;Miles from Home;Drama;Anderson, Kevin;Anderson, Jo;Sinise, Gary;53;No;NicholasCage.png 1980;;Happy Birthday to Me;Horror;Ford, Glenn;Anderson, Melissa Sue;Thompson, J. Lee;88;No;glennFord.png 1989;88;Final Notice;Mystery;Gerard, Gil;Anderson, Melody;Stern, Steven Hilliard;88;No;NicholasCage.png 1979;110;Quintet;Drama;Newman, Paul;Andersson, Bibi;Altman, Robert;19;No;paulNewman.png 1960;90;Devil's Eye, The;Drama;Kulle, Jarl;Andersson, Bibi;Bergman, Ingmar;20;No;Bergman.png 1957;91;Wild Strawberries;Drama;Sjöström, Victor;Andersson, Bibi;Bergman, Ingmar;42;Yes;Bergman.png 1956;96;Seventh Seal, The;Drama;Sydow, Max von;Andersson, Bibi;Bergman, Ingmar;62;No;Bergman.png 1992;90;Germicide;Drama;Taylor, Rod;Andersson, Bibi;;36;No;NicholasCage.png 1955;86;Dreams;Drama;Björnstrand, Gunnar;Andersson, Harriet;Bergman, Ingmar;14;No;Bergman.png 1955;95;Naked Night, The;Drama;Björnstrand, Gunnar;Andersson, Harriet;Bergman, Ingmar;38;No;Bergman.png 1962;91;Through a Glass Darkly;Drama;Björnstrand, Gunnar;Andersson, Harriet;Bergman, Ingmar;64;Yes;Bergman.png 1972;91;Cries & Whispers;Drama;Josephson, Erland;Andersson, Harriet;Bergman, Ingmar;18;Yes;Bergman.png 1958;104;Barbarian & the Geisha, The;Action;Wayne, John;Ando, Eiko;Huston, John;52;No;johnWayne.png 1967;130;Casino 
Royale;Comedy;Niven, David;Andress, Ursula;Hughes, Ken;11;No;NicholasCage.png 1962;;Dr. No;Action;Connery, Sean;Andress, Ursula;Young, Terence;7;No;seanConnery.png 1954;103;Elephant Walk;Drama;Finch, Peter;Andrews, Dana;;11;No;NicholasCage.png 1979;121;Ten;Comedy;Moore, Dudley;Andrews, Julie;Edwards, Blake;60;No;NicholasCage.png 1983;118;Man Who Loved Women, The;Comedy;Reynolds, Burt;Andrews, Julie;Edwards, Blake;67;No;NicholasCage.png 1966;190;Hawaii;Drama;Sydow, Max von;Andrews, Julie;Hill, George Roy;8;No;NicholasCage.png 1966;125;Torn Curtain;Mystery;Newman, Paul;Andrews, Julie;Hitchcock, Alfred;35;No;paulNewman.png 1986;107;Duet for One;Drama;Bates, Alan;Andrews, Julie;Konchalovsky, Andrei;82;No;NicholasCage.png 1965;172;Sound of Music, The;Music;Plummer, Christopher;Andrews, Julie;Wise, Robert;59;Yes;NicholasCage.png 1985;55;Gonzo Presents Muppet Weird Stuff;Comedy;Cleese, John;Andrews, Julie;;88;No;NicholasCage.png 1984;140;Tartuffe;Comedy;Depardieu, Gérard;Annen, Paule;Depardieu, Gérard;67;No;NicholasCage.png 1988;104;A New Life;Comedy;Alda, Alan;Ann-Margret;Alda, Alan;53;No;NicholasCage.png 1978;106;Magic;Mystery;Hopkins, Anthony;Ann-Margret;Attenborough, Richard;85;No;AnthonyHopkins.png 1992;286;Tommy;Music;Daltry, Roger;Ann-Margret;Russell, Ken;5;No;NicholasCage.png 1978;108;Big Fix, The;Mystery;Dreyfuss, Richard;Anspach, Susan;Kagan, Jeremy Paul;19;No;NicholasCage.png 1992;95;Alan & Naomi;Drama;Haas, Lukas;Aquino, Vanessa;Vanwagenen, Sterling;3;No;NicholasCage.png 1987;120;Fatal Attraction;Mystery;Douglas, Michael;Archer, Anne;Lyne, Adrian;61;No;NicholasCage.png 1992;117;Patriot Games;Action;Ford, Harrison;Archer, Anne;Noyce, Phillip;28;No;NicholasCage.png 1981;106;Woman Next Door, The;Drama;Depardieu, Gérard;Ardant, Fanny;Truffaut, François;82;No;NicholasCage.png 1992;97;Hunting;Mystery;Savage, John;Armstrong, Kerry;Howson, Frank;68;No;NicholasCage.png 1991;115;Bataan;War;Taylor, Robert;Arnaz, Desi;;68;No;NicholasCage.png 1924;110;Siegfried, The 
Nibelungenlied;Drama;Richter, Paul;Arnold, Gertrud;Lang, Fritz;79;No;NicholasCage.png 1991;90;Henry, Portrait of a Serial Killer;Horror;Rooker, Michael;Arnold, Tracy;;69;No;NicholasCage.png 1988;118;Big Blue, The;Drama;Barr, Jean-Marc;Arquette, Rosanna;Besson, Luc;7;No;NicholasCage.png 1991;115;Flight of the Intruder;Drama;Glover, Danny;Arquette, Rosanna;Milius, John;51;No;NicholasCage.png 1986;108;Nobody's Fool;Comedy;Roberts, Eric;Arquette, Rosanna;Purcell, Evelyn;52;No;NicholasCage.png 1985;97;After Hours;Comedy;Dunne, Griffin;Arquette, Rosanna;Scorsese, Martin;81;No;NicholasCage.png 1985;104;Desperately Seeking Susan;Comedy;Quinn, Aidan;Arquette, Rosanna;Seidelman, Susan;41;No;NicholasCage.png 1971;102;A New Leaf;Comedy;Matthau, Walter;Arrick, Rose;May, Elaine;83;No;NicholasCage.png 1959;91;Killers of Kilimanjaro;Action;Taylor, Robert;Aslan, Gregoire;Thorpe, Richard;11;No;NicholasCage.png 1926;126;Don Juan;Action;Barrymore, John;Astor, Mary;Crosland, Alan;55;No;NicholasCage.png 1987;102;Babette's Feast;Drama;LaFont, Jean-Philippe;Audran, Stéphane;Axel, Gabriel;79;Yes;NicholasCage.png 1989;118;Vincent, Francois, Paul & the Others;Drama;Montand, Yves;Audran, Stéphane;;20;No;NicholasCage.png 1988;141;Thunderball;Action;Connery, Sean;Auger, Claudine;Young, Terrence;8;No;seanConnery.png 1926;66;Lodger (Story of the London Fog);Mystery;Chesney, Arthur;Ault, Marie;Hitchcock, Alfred;76;No;alfredHitchcock.png 1988;103;Appointment with Death;Mystery;Ustinov, Peter;Bacall, Lauren;Donaggio, Michael Winner;75;No;NicholasCage.png 1974;128;Murder on the Orient Express;Mystery;Balsam, Martin;Bacall, Lauren;Lumet, Sidney;8;Yes;NicholasCage.png 1955;115;Blood Alley;War;Wayne, John;Bacall, Lauren;Wellman, William;15;No;johnWayne.png 1977;136;Spy Who Loved Me, The;Action;Moore, Roger;Bach, Barbara;Gilbert, Lewis;27;No;NicholasCage.png 1988;100;Storm;Action;Palfy, David;Bahtia, Stacy Christensen;Winning, David;61;No;NicholasCage.png 1991;89;Bloodbath;Horror;Hopper, Dennis;Baker, 
Carroll;;37;No;NicholasCage.png 1989;103;Miami Cops;Action;Roundtree, Richard;Baker, Dawn;Bradley, Al;40;No;NicholasCage.png 1996;96;Island of Dr. Moreau, The;Horror;Thewlis, David;Balk, Fairuza;Frankenheimer, John;39;No;NicholasCage.png 1992;100;Eighty-Four Charing Cross Road;Drama;Hopkins, Anthony;Bancroft, Anne;Jones, David;9;No;AnthonyHopkins.png 1980;124;Elephant Man, The;Drama;Hopkins, Anthony;Bancroft, Anne;Lynch, David;3;Yes;AnthonyHopkins.png 1988;90;Dr Alien;Science Fiction;Jacoby, Billy;Barash, Olivia;DeCoteau, David;70;No;NicholasCage.png 1982;120;Creepshow;Horror;Holbrook, Hal;Barbeau, Adrienne;Romero, George A.;70;No;NicholasCage.png 1987;100;Sammy & Rosie Get Laid;Drama;Din, Ayub Khan;Barber, Frances;Frears, Stephen;6;No;NicholasCage.png 1971;101;Goalie's Anxiety at the Penalty Kick, The;Drama;Brauss, Arthur;Bardischewski, Maria;Wenders, Wim;62;No;NicholasCage.png 1957;99;Mademoiselle Striptease;Comedy;Gelin, Daniel;Bardot, Brigitte;Allegret, Marc;25;No;brigitteBardot.png 1969;86;Women, The;Drama;Ronet, Maurice;Bardot, Brigitte;Aurel, Jean;66;No;brigitteBardot.png 1958;77;That Naughty Girl;Comedy;Bretonniere, Jean;Bardot, Brigitte;Boisrond, Michel;37;No;brigitteBardot.png 1959;90;Voulez-Vous Danser Avec Moi?;Comedy;Vidal, Henri;Bardot, Brigitte;Boisrond, Michel;16;No;brigitteBardot.png 1967;100;A Coeur Joie, (Head Over Heels);Action;Terzieff, Laurent;Bardot, Brigitte;Bourguignon, Serge;54;No;brigitteBardot.png 1968;113;Shalako;Westerns;Connery, Sean;Bardot, Brigitte;Dmytryk, Edward;0;No;brigitteBardot.png 1964;102;Contempt;Drama;Palance, Jack;Bardot, Brigitte;Godard, Jean-Luc;81;No;brigitteBardot.png 1965;100;Dear Brigitte;Comedy;Mumy, Billy;Bardot, Brigitte;Koster, Henry;71;No;brigitteBardot.png 1962;134;A Very Private Affair;Drama;Mastroianni, Marcello;Bardot, Brigitte;Malle, Louis;30;No;brigitteBardot.png 1964;99;Ravishing Idiot, The;Comedy;Perkins, Anthony;Bardot, Brigitte;Molinaro, Edouard;34;No;brigitteBardot.png 1958;90;Bride Is Much Too 
Beautiful, The;Comedy;Jourdan, Louis;Bardot, Brigitte;Surin, Fred;70;No;brigitteBardot.png 1955;90;Doctor at Sea;Comedy;Bogarde, Dirk;Bardot, Brigitte;Thomas, Ralph;83;No;brigitteBardot.png 1962;100;Le Repos Du Guerrier, (Warrior's Rest);War;Hossein, Robert;Bardot, Brigitte;Vadim, Roger;8;No;brigitteBardot.png 1957;90;And God Created Woman;Drama;Jurgens, Curt;Bardot, Brigitte;Vadim, Roger;29;No;brigitteBardot.png 1973;87;Ms. Don Juan;Drama;Ronet, Maurice;Bardot, Brigitte;Vadim, Roger;39;No;brigitteBardot.png 1987;97;Siesta;Drama;Byrne, Gabriel;Barkin, Ellen;Lambert, Mary;48;No;NicholasCage.png 1932;92;Rich & Strange;Drama;Kendall, Henry;Barry, Joan;Hitchcock, Alfred;57;No;alfredHitchcock.png 1987;104;Lionheart;Action;Stoltz, Eric;Barrymore, Deborah;Schaffner, Franklin J.;9;No;NicholasCage.png 1982;115;E. T. The Extra-Terrestrial;Science Fiction;Wallace, Dee;Barrymore, Drew;Spielberg, Steven;8;Yes;NicholasCage.png 1992;101;Cool World;Drama;Byrne, Gabriel;Basinger, Kim;Bakshi, Ralph;44;No;NicholasCage.png 1988;83;Nadine;Comedy;Bridges, Jeff;Basinger, Kim;Benton, Robert;47;No;NicholasCage.png 1989;126;Batman;Action;Nicholson, Jack;Basinger, Kim;Burton, Tim;14;No;JackNicholson.png 1987;95;Blind Date;Comedy;Willis, Bruce;Basinger, Kim;Edwards, Blake;7;No;NicholasCage.png 1982;101;Mother Lode;Action;Heston, Charlton;Basinger, Kim;Heston, Charlton;40;No;NicholasCage.png 1992;125;Final Analysis;Drama;Gere, Richard;Basinger, Kim;Joanou, Phil;50;No;NicholasCage.png 1983;134;Never Say Never Again;Action;Connery, Sean;Basinger, Kim;Kershner, Irvin;8;No;seanConnery.png 1986;117;Nine & a Half Weeks;Drama;Rourke, Mickey;Basinger, Kim;Lyne, Adrian;7;No;NicholasCage.png 1989;;Killjoy;Mystery;Culp, Robert;Basinger, Kim;Moxey, John Llewellyn;71;No;NicholasCage.png 1986;108;No Mercy;Drama;Gere, Richard;Basinger, Kim;Pearce, Richard;11;No;NicholasCage.png 1991;116;Marrying Man, The;Comedy;Baldwin, Alec;Basinger, Kim;Rees, Jerry;84;No;NicholasCage.png 1990;123;Misery;Horror;Caan, 
James;Bates, Kathy;Reiner, Rob;48;Yes;NicholasCage.png 1946;93;Crisis;Drama;Andersson, Wiktor;Baude, Anna-Lisa;Bergman, Ingmar;66;No;Bergman.png 1984;95;Samson & Delilah;Drama;Hamilton, Antony;Bauer, Belinda;Philips, Lee;36;No;NicholasCage.png 1990;101;Act of Piracy;Mystery;Busey, Gary;Bauer, Belinda;;74;No;NicholasCage.png 1988;96;Split Decisions;Drama;Hackman, Gene;Beals, Jennifer;Drury, David;52;No;NicholasCage.png 1989;103;Vampire's Kiss;Comedy;Cage, Nicolas;Beals, Jennifer;;49;No;NicholasCage.png 1988;96;Nightmare at Noon;Action;Hauser, Wings;Beck, Kimberly;Mastorakis, Nico;0;No;NicholasCage.png 1990;127;Presumed Innocent;Mystery;Ford, Harrison;Bedelia, Bonnie;Pakula, Alan J.;69;No;NicholasCage.png 1942;123;Reap the Wild Wind;Drama;Wayne, John;Beecher, Janet;DeMille, Cecil B.;59;No;johnWayne.png 1972;100;Pocket Money;Comedy;Newman, Paul;Belford, Christine;Rosenberg, Stuart;55;No;paulNewman.png 1977;102;Mary White;Drama;Flanders, Ed;Beller, Kathleen;Taylor, Jud;2;No;NicholasCage.png 1982;;Catch a Rising Star, Tenth Anniversary;Comedy;Belzer, Richard;Benatar, Pat;;18;No;NicholasCage.png 1990;105;Guilty by Suspicion;Drama;De Niro, Robert;Bening, Annette;Winkler, Irwin;88;No;NicholasCage.png 1948;99;Secret Beyond the Door;Mystery;Redgrave, Michael;Bennett, Joan;Lang, Fritz;31;No;NicholasCage.png 1945;103;Scarlet Street;Drama;Robinson, Edward G.;Bennett, Joan;Lang, Fritz;80;No;NicholasCage.png 1988;76;Daffy Duck's Quackbusters;Action;Blanc, Mel;Bennett, Julie;Ford, Greg;68;No;NicholasCage.png 1985;55;Rowlf's Rhapsodies with the Muppets;Comedy;Burns, George;Berenson, Marisa;;79;No;NicholasCage.png 1982;188;Gandhi;Drama;Kingsley, Ben;Bergen, Candice;Attenborough, Richard;7;Yes;NicholasCage.png 1975;120;Wind & the Lion, The;Action;Connery, Sean;Bergen, Candice;Milius, John;2;No;seanConnery.png 1971;96;Carnal Knowledge;Drama;Nicholson, Jack;Bergen, Candice;Nichols, Mike;10;No;JackNicholson.png 1970;126;Getting Straight;Comedy;Gould, Elliott;Bergen, Candice;Rush, 
Richard;83;No;NicholasCage.png 1972;90;Scarlet Letter, The;Drama;Albaicín, Rafael;Berger, Senta;Wenders, Wim;55;No;NicholasCage.png 1935;75;Count of Old Town, The;Comedy;Adolphson, Edvin;Bergman, Ingrid;Adolphson, Edvin;72;No;ingridBergman.png 1978;97;Autumn Sonata;Drama;Björk, Halvar;Bergman, Ingrid;Bergman, Ingmar;49;Yes;ingridBergman.png 1944;114;Gaslight;Drama;Boyer, Charles;Bergman, Ingrid;Cukor, George;25;Yes;ingridBergman.png 1958;100;Indiscreet;Drama;Grant, Cary;Bergman, Ingrid;Donen, Stanley;1;No;ingridBergman.png 1941;75;Walpurgis Night;Drama;Sjöström, Victor;Bergman, Ingrid;Edgren, Gustaf;32;No;ingridBergman.png 1948;100;Joan of Arc;Drama;Ferrer, Jose;Bergman, Ingrid;Fleming, Victor;7;No;ingridBergman.png 1982;195;A Woman Called Golda;Drama;Beatty, Ned;Bergman, Ingrid;Gibson, Alan;15;Yes;ingridBergman.png 1969;98;A Walk in the Spring Rain;Drama;Quinn, Anthony;Bergman, Ingrid;Green, Guy;2;No;ingridBergman.png 1949;117;Under Capricorn;Drama;Cotten, Joseph;Bergman, Ingrid;Hitchcock, Alfred;74;No;ingridBergman.png 1946;101;Notorious;Mystery;Grant, Cary;Bergman, Ingrid;Hitchcock, Alfred;42;No;ingridBergman.png 1940;90;June Night;Drama;Widgren, Olof;Bergman, Ingrid;Lindberg, Per;14;No;ingridBergman.png 1961;120;Goodbye Again;Drama;Perkins, Anthony;Bergman, Ingrid;Litvak, Anatole;6;No;ingridBergman.png 1956;106;Anastasia;Drama;Tamiroff, Akim;Bergman, Ingrid;Litvak, Anatole;24;Yes;ingridBergman.png 1945;126;Bells of St. 
Mary's, The;Drama;Crosby, Bing;Bergman, Ingrid;McCarey, Leo;31;No;ingridBergman.png 1937;91;Intermezzo;Drama;Ekman, Gösta;Bergman, Ingrid;Molander, Gustaf;32;No;ingridBergman.png 1938;104;A Woman's Face;Drama;Svennberg, Tore;Bergman, Ingrid;Molander, Gustaf;49;No;ingridBergman.png 1935;90;Swedenhielms;Drama;Westergren, Håkan;Bergman, Ingrid;Molander, Gustaf;88;No;ingridBergman.png 1939;87;Only One Night;Drama;Adolphson, Edvin;Bergman, Ingrid;Molander, Gustav;26;No;ingridBergman.png 1938;74;Dollar;Drama;Rydeberg, Georg;Bergman, Ingrid;Molander, Gustav;19;No;ingridBergman.png 1956;98;Elena & Her Men;Drama;Ferrer, Mel;Bergman, Ingrid;Renoir, Jean;33;No;ingridBergman.png 1952;110;Europa Fifty-One;Drama;Knox, Alexander;Bergman, Ingrid;Rossellini, Roberto;34;No;ingridBergman.png 1953;83;Voyage in Italy;Drama;Sanders, George;Bergman, Ingrid;Rossellini, Roberto;57;No;ingridBergman.png 1954;81;Fear;Drama;Wieman, Mathias;Bergman, Ingrid;Rossellini, Roberto;69;No;ingridBergman.png 1950;107;Stromboli;Drama;Vitale, Mario;Bergman, Ingrid;Rossellini, Roberto;69;No;ingridBergman.png 1969;103;Cactus Flower;Comedy;Matthau, Walter;Bergman, Ingrid;Saks, Gene;67;Yes;ingridBergman.png 1989;105;Hideaways;Comedy;Conover, Bruce;Bergman, Ingrid;;16;No;ingridBergman.png 1990;90;Twenty Four Hours in a Woman's Life;Drama;Torn, Rip;Bergman, Ingrid;;16;No;ingridBergman.png 1987;91;Programmed to Kill;Action;Ginty, Robert;Bergman, Sandahl;Holzman, Allan;71;No;NicholasCage.png 1982;128;Conan the Barbarian;Action;Schwarzenegger, Arnold;Bergman, Sandahl;Milius, John;45;No;NicholasCage.png 1991;91;Raw Nerve;Mystery;Ford, Glenn;Bergman, Sandahl;Prior, David A.;88;No;glennFord.png 1970;94;Think Dirty;Comedy;Feldman, Marty;Berman, Shelley;Clark, Jim;31;No;NicholasCage.png 1982;108;King of Comedy;Drama;De Niro, Robert;Bernhard, Sandra;Scorsese, Martin;84;No;NicholasCage.png 1983;60;Best of the Big Laff Off, The;Comedy;Murphy, Eddie;Bernhard, Sandra;;20;No;NicholasCage.png 1984;158;Amadeus;Drama;Abraham, 
F. Murray;Berridge, Elizabeth;Forman, Milos;6;Yes;NicholasCage.png 1973;101;White Lightning;Action;Reynolds, Burt;Billingsley, Jennifer;Sargent, Joseph;54;No;NicholasCage.png 1988;172;Unbearable Lightness of Being, The;Drama;Day-Lewis, Daniel;Binoche, Juliette;Kaufman, Philip;5;Yes;NicholasCage.png 1972;124;Life & Times of Judge Roy Bean, The;Western;Newman, Paul;Bisset, Jacqueline;Huston, John;65;No;paulNewman.png 1970;137;Airport;Drama;Lancaster, Burt;Bisset, Jacqueline;Seaton, George;0;Yes;burtLancaster.png 1973;116;Day for Night;Drama;Aumont, Jean-Pierre;Bisset, Jacqueline;Truffaut, François;10;Yes;NicholasCage.png 1952;107;Secrets of Women;Comedy;Malmsten, Birger;Björk, Anita;Bergman, Ingmar;66;No;Bergman.png 1976;116;Burnt Offerings;Horror;Reed, Oliver;Black, Karen;Curtis, Dan;35;No;NicholasCage.png 1969;94;Easy Rider;Drama;Fonda, Peter;Black, Karen;Hopper, Dennis;36;No;NicholasCage.png 1991;98;Five Easy Pieces;Drama;Nicholson, Jack;Black, Karen;Rafelson, Bob;2;No;JackNicholson.png 1974;144;Day of the Locust, The;Drama;Sutherland, Donald;Black, Karen;Schlesinger, John;81;No;NicholasCage.png 1964;112;Goldfinger;Action;Connery, Sean;Blackman, Honor;Hamilton, Guy;77;No;seanConnery.png 1977;117;Exorcist II, The Heretic;Horror;Burton, Richard;Blair, Linda;Boorman, John;29;No;NicholasCage.png 1953;61;White Lightning;;Clements, Stanley;Blondell, Gloria;Bernds, Edward;;No;NicholasCage.png 1942;88;Lady for a Night;Drama;Wayne, John;Blondell, Joan;Leigh, Jason;12;No;johnWayne.png 1968;103;Charly;Drama;Robertson, Cliff;Bloom, Claire;Nelson, Ralph;38;Yes;NicholasCage.png 1973;105;High Plains Drifter;Western;Eastwood, Clint;Bloom, Verna;Eastwood, Clint;57;No;clintEastwood.png 1982;123;Honkytonk Man;Drama;Eastwood, Clint;Bloom, Verna;Eastwood, Clint;69;No;clintEastwood.png 1990;102;Nightbreed;Horror;Cronenberg, David;Bobby, Anne;Barker, Clive;72;No;NicholasCage.png 1987;98;Under the Sun of Satan;Drama;Depardieu, Gérard;Bonnaire, Sandrine;Pialat, 
Maurice;45;No;NicholasCage.png 1985;105;Vagabond;Drama;Meril, Macha;Bonnaire, Sandrine;Varda, Agnes;49;No;NicholasCage.png 1993;60;Bill Cosby, Live at Harrah's;Comedy;Cosby, Bill;Boosler, Elayne;;13;No;NicholasCage.png 1974;89;Monty Python & the Holy Grail;Comedy;Chapman, Graham;Booth, Connie;Gilliam, Terry;83;No;NicholasCage.png 1993;65;John Cleese on How to Irritate People;Comedy;Cleese, John;Booth, Connie;;62;No;NicholasCage.png 1958;101;Matchmaker, The;Comedy;Perkins, Anthony;Booth, Shirley;Anthony, Joseph;67;No;NicholasCage.png 1981;129;For Your Eyes Only;Action;Moore, Roger;Bouquet, Carole;Glen, John;86;No;NicholasCage.png 1928;139;Wings;War;Rogers, Buddy;Bow, Clara;Wellman, William;44;Yes;NicholasCage.png 1992;106;Medicine Man;Action;Connery, Sean;Bracco, Lorraine;McTiernan, John;6;No;seanConnery.png 1989;;Good Fellas;Drama;De Niro, Robert;Bracco, Lorraine;Scorsese, Martin;15;No;NicholasCage.png 1985;119;Kiss of the Spider Woman;Drama;Hurt, William;Braga, Sonia;Babenco, Hector;10;Yes;NicholasCage.png 1990;121;Rookie, THe;Action;Eastwood, Clint;Braga, Sonia;Eastwood, Clint;48;No;clintEastwood.png 1973;129;Sting, The;Drama;Newman, Paul;Brennan, Eileen;Hill, George Roy;83;Yes;paulNewman.png 1958;96;Torpedo Run;War;Ford, Glenn;Brewster, Diane;Pevney, Joseph;50;No;glennFord.png 1986;101;Instant Justice;Drama;Paré, Michael;Bridges, Lynda;Rumar, Craig;45;No;NicholasCage.png 1990;135;Cyrano de Bergerac;Drama;Depardieu, Gérard;Brochet, Anne;Rappeneau, Jean-Paul;76;No;NicholasCage.png 1948;110;Border Street;Drama;Fijewski, Tadeusz;Broniewska, Maria;Ford, Aleksander;73;No;NicholasCage.png 1987;91;Firehouse;Comedy;Hopkins, Barrett;Brown, Violet;Ingvordsen, J. 
Christian;66;No;NicholasCage.png 1965;123;Morituri;Drama;Brando, Marlon;Brynner, Yul;Wicki, Bernhard;9;No;brando.png 1980;104;From the Life of the Marionettes;Drama;Atzorn, Robert;Buchegger, Christine;Bergman, Ingmar;58;No;Bergman.png 1988;120;Frantic;Mystery;Ford, Harrison;Buckley, Betty;Polanski, Roman;17;No;NicholasCage.png 1978;114;Coma;Science Fiction;Douglas, Michael;Bujold, Geneviève;Crichton, Michael;64;No;NicholasCage.png 1988;117;Dead Ringers;Drama;Irons, Jeremy;Bujold, Geneviève;Cronenberg, David;29;No;NicholasCage.png 1988;90;Golden Ninja Invasion;Action;West, Leonard;Burd, Stephanie;Lambert, Bruce;13;No;NicholasCage.png 1973;122;Exorcist, The;Horror;Sydow, Max von;Burstyn, Ellen;Friedkin, William;28;Yes;NicholasCage.png 1975;112;Alice Doesn't Live Here Anymore;Comedy;Kristofferson, Kris;Burstyn, Ellen;;82;Yes;NicholasCage.png 1982;94;Eyes of the Amaryllis, The;Drama;Bolt, Jonathan;Byrne, Martha;King Keller, Frederick;70;No;NicholasCage.png 1952;109;What Price Glory?;War;Cagney, James;Calvet, Corinne;Ford, John;4;No;johnFord.png 1954;40;Inauguration of the Pleasure Dome;Short;De Brier, Sampson;Cameron, Marjorie;Anger, Kenneth;62;No;NicholasCage.png 1989;114;School Daze;Comedy;Fishburne, Larry;Campbell, Tisha;Lee, Spike;18;No;NicholasCage.png 1990;102;End of Innocence, The;Drama;Heard, John;Cannon, Dyan;Cannon, Dyan;6;No;NicholasCage.png 1971;98;Anderson Tapes, The;Mystery;Connery, Sean;Cannon, Dyan;Lumet, Sidney;1;No;seanConnery.png 1983;50;Father Murphy, A Horse from Heaven;Comedy;Olsen, Merlin;Cannon, Katharine;Claxton, William F.;28;No;NicholasCage.png 1989;80;Skull;Drama;Bideman, Robert;Capone, Nadia;Bergman, Robert;19;No;NicholasCage.png 1987;91;Quick & The Dead, The;Western;Elliott, Sam;Capshaw, Kate;Day, Robert;40;No;NicholasCage.png 1984;94;Best Defense;Comedy;Moore, Dudley;Capshaw, Kate;Huyck, Willard;75;No;NicholasCage.png 1984;99;Dreamscape;Science Fiction;Quaid, Dennis;Capshaw, Kate;Ruben, Joseph;63;No;NicholasCage.png 1989;125;Black 
Rain;Action;Douglas, Michael;Capshaw, Kate;Scott, Ridley;73;No;NicholasCage.png 1963;138;8 1/2;Drama;Mastroianni, Marcello;Cardinale, Claudia;Fellini, Federico;80;Yes;NicholasCage.png 1935;64;One Frightened Night;Horror;Ford, Wallace;Carlisle, Mary;Cabanne, Christy;33;No;NicholasCage.png 1988;103;Year My Voice Broke, The;Drama;Taylor, Noah;Carmen, Loene;Duigan, John;71;No;NicholasCage.png 1966;175;Is Paris Burning?;War;Belmondo, Jean-Paul;Caron, Leslie;Clément, René;63;No;NicholasCage.png 1974;313;QB VII;Drama;Hopkins, Anthony;Caron, Leslie;Gries, Tom;28;Yes;AnthonyHopkins.png 1977;104;Island of Dr. Moreau, The;Horror;Lancaster, Burt;Carrera, Barbara;Taylor, Don;54;No;burtLancaster.png 1983;104;Beyond the Limit;Drama;Caine, Michael;Carrillo, Elpidia;Mackenzie, John;51;No;NicholasCage.png 1936;84;Secret Agent;Mystery;Lorre, Peter;Carroll, Madeleine;Hitchcock, Alfred;50;No;alfredHitchcock.png 1986;71;Paramount Comedy Theater: Well-Developed;Comedy;Mahler, Bruce;Carter, Judy;;40;No;NicholasCage.png 1972;71;Big Bust Out, The;Action;Kendall, Tony;Carter, Karen;Theumer, Ernst R. 
von;50;No;NicholasCage.png 1987;119;Fourth Protocol, The;Mystery;Caine, Michael;Cassidy, Joanna;Mackenzie, John;14;No;NicholasCage.png 1990;107;Gremlins 2: The New Batch;Comedy;Galligan, Zach;Cates, Phoebe;Dante, Joe;61;No;NicholasCage.png 1982;92;Fast Times at Ridgemont High;Comedy;Penn, Sean;Cates, Phoebe;Heckerling, Amy;65;No;NicholasCage.png 1987;;Mannequin;Comedy;McCarthy, Andrew;Cattrall, Kim;Gottlieb, Michael;23;No;NicholasCage.png 1977;91;Rabid;Horror;Moore, Frank;Chambers, Marilyn;Cronenberg, David;34;No;NicholasCage.png 1990;;Party, The;Comedy;Sellers, Peter;Champion, Marge;Edwards, Blake;32;No;NicholasCage.png 1989;90;Vampire Raiders, Ninja Queen;Action;Peterson, Chris;Chan, Agnes;Lambert, Bruce;15;No;NicholasCage.png 1970;26;Bloopers from Star Trek;Comedy;Lawford, Peter;Channing, Carol;;22;No;NicholasCage.png 1943;99;Destroyer;Action;Robinson, Edward G.;Chapman, Marguerite;Seiter, William A.;87;No;NicholasCage.png 1992;99;Party Girl;Comedy;Taylor, Robert;Charisse, Cyd;Ray, Nicholas;85;No;NicholasCage.png 1989;113;Twin Peaks;Mystery;MacLachlan, Kyle;Chen, Joan;Lynch, David;86;No;kyle.png 1987;103;Moonstruck;Comedy;Cage, Nicholas;Cher;Jewison, Norman;6;Yes;NicholasCage.png 1987;119;Witches of Eastwick, The;Comedy;Nicholson, Jack;Cher;Miller, George;8;No;NicholasCage.png 1979;128;Moonraker;Action;Moore, Roger;Chiles, Lois;Gilbert, Lewis;32;No;NicholasCage.png 1984;106;Beat Street;Drama;Davis, Guy;Chong, Rae Dawn;Lathan, Stan;72;No;NicholasCage.png 1986;88;Running Out of Luck;Comedy;Jagger, Mick;Chong, Rae Dawn;;16;No;NicholasCage.png 1989;90;Never on Tuesday;Drama;Lauer, Andrew;Christian, Claudia;Rifkin, Adam;77;No;NicholasCage.png 1975;109;Shampoo;Comedy;Beatty, Warren;Christie, Julie;Ashby, Hal;69;Yes;NicholasCage.png 1985;111;Power;Drama;Hackman, Gene;Christie, Julie;Lumet, Sidney;43;No;NicholasCage.png 1965;122;Darling;Drama;Harvey, Laurence;Christie, Julie;Schlesinger, John;44;Yes;NicholasCage.png 1963;120;Ugly American, The;Drama;Brando, 
Marlon;Church, Sandra;Englund, George;63;No;brando.png 1931;68;Ambassador Bill;Comedy;Rogers, Will;Churchill, Marguerite;Taylor, Sam;66;No;NicholasCage.png 1931;110;Big Trail, The;Western;Wayne, John;Churchill, Marguerite;Walsh, Raoul;22;No;johnWayne.png 1967;111;Hombre;Western;Newman, Paul;Cilento, Diane;Ritt, Martin;50;No;paulNewman.png 1968;103;Coogan's Bluff;Action;Eastwood, Clint;Clark, Susan;Siegel, Don;57;No;clintEastwood.png 1989;91;Penn & Teller Get Killed;Comedy;Penn, Jillette;Clarke, Caitlin;Penn, Arthur;12;No;NicholasCage.png 1987;118;Shy People;Drama;Philbin, John;Clayburgh, Jill;Konchalovsky, Andrei;7;No;NicholasCage.png 1980;91;It's My Turn;Comedy;Douglas, Michael;Clayburgh, Jill;Weill, Claudia;0;No;NicholasCage.png 1988;119;Dangerous Liaisons;Drama;Malkovich, John;Close, Glenn;Frears, Stephen;77;No;MichellePfeiffer.png 1990;111;Reversal of Fortune;Drama;Irons, Jeremy;Close, Glenn;Schroeder, Barbet;73;Yes;NicholasCage.png 1991;119;Meeting Venus;Comedy;Arestrup, Niels;Close, Glenn;Szabó, István;74;No;NicholasCage.png 1946;105;Tomorrow Is Forever;Drama;Welles, Orson;Colbert, Claudette;;65;No;NicholasCage.png 1987;101;Like Father Like Son;Comedy;Cameron, Kirk;Colin, Margaret;Daniel, Rod;20;No;NicholasCage.png 1948;81;Rope;Drama;Stewart, James;Collier, Constance;Hitchcock, Alfred;39;No;alfredHitchcock.png 1962;91;Road to Hong Kong;Comedy;Hope, Bob;Collins, Joan;Panama, Norman;37;No;NicholasCage.png 1989;108;Shirley Valentine;Comedy;Conti, Tom;Collins, Pauline;Gilbert, Lewis;51;No;NicholasCage.png 1992;135;City of Joy;Drama;Swayze, Patrick;Collins, Pauline;Joffe, Roland;87;No;NicholasCage.png 1966;99;Appaloosa, The;Western;Brando, Marlon;Comer, Anjanette;Furie, Sidney J.;15;No;brando.png 1986;88;Seven Minutes in Heaven;Comedy;Thames, Byron;Connelly, Jennifer;Feferman, Linda;49;No;NicholasCage.png 1991;96;Hearts of Darkness, A Filmmaker's Apocalypse;Drama;Bottoms, Sam;Coppola, Eleanor;Bahr, Fax;72;No;NicholasCage.png 1961;66;Tonight for Sure;Comedy;Lee, 
Karla;Cornell, Laura;Coppola, Francis Ford;4;No;NicholasCage.png 1990;110;White Hunter, Black Heart;Adventure;Eastwood, Clint;Cornwell, Charlotte;Eastwood, Clint;66;No;clintEastwood.png 1962;110;Sundays & Cybele;Drama;Kruger, Hardy;Courcel, Nicole;Bourguignon, Serge;11;Yes;NicholasCage.png 1989;90;Puppet Master;Science Fiction;LeMat, Paul;Crampton, Barbara;Schmoeller, David;20;No;NicholasCage.png 1991;95;Night Gallery;Horror;McDowall, Roddy;Crawford, Joan;Spielberg, Steven;31;No;NicholasCage.png 1989;103;Pet Sematary;Horror;Gwynne, Fred;Crosby, Denise;Lambert, Mary;27;No;NicholasCage.png 1992;60;America's Music, Gospel;Music;Phipps, Wentley;Crouch, Sandra;Walton, Kip;13;No;NicholasCage.png 1977;123;Slap Shot;Comedy;Newman, Paul;Crouse, Lindsay;Hill, George Roy;82;No;paulNewman.png 1987;109;O. C. & Stiggs;Comedy;Jenkins, Daniel H.;Curtin, Jane;Altman, Robert;3;No;NicholasCage.png 1988;108;A Fish Called Wanda;Comedy;Cleese, John;Curtis, Jamie Lee;Crichton, Charles;7;Yes;NicholasCage.png 1954;96;A Lesson in Love;Comedy;Björnstrand, Gunnar;Dahlbeck, Eva;Bergman, Ingmar;48;No;Bergman.png 1957;82;Brink of Life;Drama;Josephson, Erland;Dahlbeck, Eva;Bergman, Ingmar;57;No;Bergman.png 1986;120;Betty Blue;Drama;Anglade, Jean-Hughes;Dalle, Béatrice;Beineix, Jean-Jacques;71;No;NicholasCage.png 1979;122;Hair;Music;Savage, John;D'Angelo, Beverly;Forman, Milos;67;No;NicholasCage.png 1989;97;National Lampoon's Christmas Vacation;Comedy;Chase, Chevy;D'Angelo, Beverly;S, Jeremiah;81;No;NicholasCage.png 1974;124;Dersu Uzala, (The Hunter);Adventure;Solomin, Yuri;Danilchenko, Svetlana;Kurosawa, Akira;81;Yes;NicholasCage.png 1990;106;Alice;Comedy;Baldwin, Alec;Danner, Blythe;Allen, Woody;22;No;woody.png 1980;90;Fifth Floor, The;Mystery;Hopkins, Bo;D'Arbanville, Patti;Avedis, Howard Hikmet;74;No;NicholasCage.png 1990;94;Snow Kill;Drama;Knox, Terence;D'Arbanville, Patti;Wright, Thomas J.;35;No;NicholasCage.png 1971;74;People, The;Drama;Shatner, William;Darby, Kim;Coppola, Francis 
Ford;36;No;NicholasCage.png 1969;128;True Grit;Western;Wayne, John;Darby, Kim;Hathaway, Henry;77;Yes;johnWayne.png 1942;18;Battle of Midway, The;War;Crisp, Donald;Darwell, Jane;Ford, John;75;No;johnFord.png 1948;103;Three Godfathers;Western;Wayne, John;Darwell, Jane;Ford, John;72;No;johnWayne.png 1965;133;Hush, Hush, Sweet Charlotte;Mystery;Cotten, Joseph;Davis, Bette;Aldrich, Robert;68;No;NicholasCage.png 1946;110;A Stolen Life;Drama;Ford, Glenn;Davis, Bette;Bernhardt, Curtis;20;No;glennFord.png 1939;96;Old Maid, The;Drama;Brent, George;Davis, Bette;Goulding, Edmund;18;No;NicholasCage.png 1950;138;All about Eve;Drama;Sanders, George;Davis, Bette;Mankiewicz, Joseph L.;23;Yes;NicholasCage.png 1986;96;Fly, The;Horror;Goldblum, Jeff;Davis, Geena;Cronenberg, David;33;No;NicholasCage.png 1990;89;Quick Change;Comedy;Murray, Bill;Davis, Geena;Franklin, Howard ;24;No;NicholasCage.png 1988;93;Lair of the White Worm, The;Horror;Grant, Hugh;Davis, Sammi;Russell, Ken;16;No;NicholasCage.png 1989;104;Rainbow, The;Drama;Hemmings, David;Davis, Sammi;Russell, Ken;53;No;NicholasCage.png 1956;120;Man Who Knew Too Much, The;Mystery;Stewart, James;Day, Doris;Hitchcock, Alfred;15;No;alfredHitchcock.png 1992;90;Beauty & the Beast;Science Fiction;Marais, Jean;Day, Josette;Cocteau, Jean;14;No;NicholasCage.png 1940;120;Foreign Correspondent;Mystery;McCrea, Joel;Day, Laraine;Hitchcock, Alfred;61;No;alfredHitchcock.png 1949;115;Heiress, The;Drama;Richardson, Ralph;De Havilland, Olivia;Wyler, William;81;Yes;NicholasCage.png 1986;120;Boy Who Could Fly, The;Drama;Underwood, Jay;Deakins, Lucy;Castle, Nick;25;No;NicholasCage.png 1975;89;Terrorists, The;Action;Connery, Sean;Dean, Isabel;Wrede, Caspar;4;No;seanConnery.png 1942;85;Wheel of Fortune;Drama;Wayne, John;Dee, Frances;Auer, John H.;36;No;johnWayne.png 1989;120;Do the Right Thing;Drama;Aiello, Danny;Dee, Ruby;Lee, Spike;5;No;NicholasCage.png 1990;93;Court-Martial of Jackie Robinson, The;Drama;Braugher, Andre;Dee, Ruby;Peerce, 
Larry;33;No;NicholasCage.png 1967;90;Elvira Madigan;Drama;Berggren, Thommy;Degermark, Pia;Widerberg, Bo;28;No;NicholasCage.png 1992;86;Hurricane Smith;Action;Weathers, Carl;Delaney, Cassandra;Budds, Colin;16;No;NicholasCage.png 1987;86;Fair Game;Action;Ford, Peter;Delaney, Cassandra;;24;No;NicholasCage.png 1989;95;Rape of the Sabines, The;Action;Moore, Roger;Demongeot, Mylene;;83;No;NicholasCage.png 1983;99;Risky Business;Comedy;Cruise, Tom;DeMornay, Rebecca;Brickman, Paul;28;No;NicholasCage.png 1980;103;I Love All of You (Je Vous Aime);Drama;Depardieu, Gérard;Deneuve, Catherine;Berri, Claude;40;No;NicholasCage.png 1986;108;Love Songs;Drama;Lambert, Christopher;Deneuve, Catherine;Chouraqui, Elie;15;No;NicholasCage.png 1983;114;Le Choix des Armes;Mystery;Montand, Yves;Deneuve, Catherine;Comeau, Alain;15;No;NicholasCage.png 1981;135;Choice of Arms;Action;Montand, Yves;Deneuve, Catherine;Corneau, Alan;87;No;NicholasCage.png 1977;107;March or Die;War;Hackman, Gene;Deneuve, Catherine;Richards, Dick;59;No;NicholasCage.png 1980;135;Last Metro, The;Drama;Depardieu, Gérard;Deneuve, Catherine;Truffaut, François;66;No;NicholasCage.png 1986;120;Jean de Florette;Drama;Montand, Yves;Depardieu, Elizabeth;Berri, Claude;87;Yes;NicholasCage.png 1989;127;Fat Man & Little Boy;Drama;Newman, Paul;Dern, Laura;Joffe, Roland;86;No;paulNewman.png 1990;125;Wild at Heart;Drama;Cage, Nicolas;Dern, Laura;Lynch, David;6;No;NicholasCage.png 1989;113;Family Business;Action;Connery, Sean;DeSoto, Rosana;Lumet, Sidney;5;No;seanConnery.png 1988;103;Stand & Deliver;Drama;Olmos, Edward James;DeSoto, Rosana;Menendez, Ramon;19;No;NicholasCage.png 1981;94;Looker;Science Fiction;Finney, Albert;Dey, Susan;Crichton, Michael;62;No;NicholasCage.png 1989;89;Fire & Rain;Action;Haid, Charles;Dickinson, Angie;Jameson, Jerry;10;No;NicholasCage.png 1990;56;Best of Candid Camera, The;Comedy;Allen, Woody;Dickinson, Angie;;12;No;woody.png 1940;83;Seven Sinners;Drama;Wayne, John;Dietrich, Marlene;Garnett, 
Tay;24;No;johnWayne.png 1961;190;Judgment at Nuremberg;Drama;Tracy, Spencer;Dietrich, Marlene;Kramer, Stanley;39;Yes;spencerTracy.png 1989;60;Minsky's Follies;Comedy;Taylor, Rip;Diller, Phyllis;;12;No;NicholasCage.png 1990;97;Novice, The;Comedy;Sharif, Omar;Dombasle, Arielle;;72;No;NicholasCage.png 1987;130;Wings of Desire;Drama;Ganz, Bruno;Dommartin, Solveig;Wenders, Wim;71;No;NicholasCage.png 1991;158;Until the End of the World;Drama;Hurt, William;Dommartin, Solveig;Wenders, Wim;57;No;NicholasCage.png 1987;118;Castaway;Drama;Reed, Oliver;Donohoe, Amanda;Roeg, Nicolas;41;No;NicholasCage.png 1993;30;Alfred Hitchcock Presents, Sorcerer's Apprentice;Mystery;Hitchcock, Alfred;Dors, Diana;;60;No;NicholasCage.png 1991;99;Delicatessen;Comedy;Benezech, Pascal;Dougnac, Marie-Laure;Caro, Marc;78;No;NicholasCage.png 1979;110;Great Train Robbery, The;Mystery;Connery, Sean;Down, Lesley-Anne;Crichton, Michael;7;No;seanConnery.png 1991;110;Hanover Street;Drama;Ford, Harrison;Down, Lesley-Anne;Hyams, Peter;81;No;NicholasCage.png 1991;102;Hunchback;Drama;Hopkins, Anthony;Down, Lesley-Anne;Tuchner, Michael;33;No;AnthonyHopkins.png 1946;97;My Darling Clementine;Western;Fonda, Henry;Downs, Cathy;Ford, John;12;No;johnFord.png 1950;86;Wagon Master;Western;Johnson, Ben;Dru, Joanne;Ford, John;30;No;johnFord.png 1949;93;She Wore a Yellow Ribbon;Western;Wayne, John;Dru, Joanne;Ford, John;84;No;johnWayne.png 1985;90;Fantasy Man;Comedy;Hopkins, Harold;Drynan, Jeanie;Meagher, John;82;No;NicholasCage.png 1986;87;Monster in the Closet;Comedy;Grant, Donald;DuBarry, Denise;Dahlin, Bob;39;No;NicholasCage.png 1992;85;Double Edge;Drama;Eban, Abba;Dunaway, Faye;Kollek, Amos;69;No;clintEastwood.png 1976;116;Network;Comedy;Finch, Peter;Dunaway, Faye;Lumet, Sidney;48;Yes;NicholasCage.png 1974;131;Chinatown;Drama;Nicholson, Jack;Dunaway, Faye;Polanski, Roman;55;Yes;JackNicholson.png 1975;117;Three Days of the Condor;Drama;Redford, Robert;Dunaway, Faye;Pollack, Sydney;87;No;NicholasCage.png 
1977;134;Voyage of the Damned;Drama;Sydow, Max von;Dunaway, Faye;Rosenberg, Stuart;34;No;NicholasCage.png 1987;97;Barfly;Drama;Rourke, Mickey;Dunaway, Faye;Schroeder, Barbet;23;No;NicholasCage.png 1990;104;Wait Until Spring, Bandini;Drama;Mantegna, Joe;Dunaway, Faye;;20;No;NicholasCage.png 1947;118;Life with Father;Comedy;Powell, William;Dunne, Irene;Curtiz, Michael;10;No;NicholasCage.png 1943;;A Guy Named Joe;Drama;Tracy, Spencer;Dunne, Irene;Fleming, Victor;42;No;spencerTracy.png 1974;117;Stavisky;Drama;Belmondo, Jean-Paul;Duperey, Anny;Resnais, Alain;1;No;NicholasCage.png 1981;117;Time Bandits;Comedy;Cleese, John;Duvall, Shelley;Gilliam, Terry;5;No;NicholasCage.png 1980;144;Shining, The;Horror;Nicholson, Jack;Duvall, Shelley;Kubrick, Stanley;32;No;JackNicholson.png 1945;91;Flame of Barbary Coast;Western;Wayne, John;Dvorak, Ann;Kane, Joseph;54;No;johnWayne.png 1993;92;Naked Truth, The;Comedy;Sellers, Peter;Eaton, Shirley;;34;No;NicholasCage.png 1979;92;Brood, The;Horror;Reed, Oliver;Eggar, Samantha;Cronenberg, David;51;No;NicholasCage.png 1970;123;Molly Maguires, The;Action;Connery, Sean;Eggar, Samantha;Ritt, Martin;3;No;seanConnery.png 1984;105;Beverly Hills Cop;Comedy;Murphy, Eddie;Eilbacher, Lisa;Brest, Martin;41;No;NicholasCage.png 1991;86;Blind Man's Bluff;Mystery;Urich, Robert;Eilbacher, Lisa;Quinn, James;64;No;NicholasCage.png 1961;140;La Dolce Vita;Drama;Mastroianni, Marcello;Ekberg, Anita;Fellini, Federico;20;No;NicholasCage.png 1966;103;After the Fox;Comedy;Sellers, Peter;Ekland, Britt;De Sica, Vittorio;60;No;NicholasCage.png 1974;127;Man with the Golden Gun, The;Action;Moore, Roger;Ekland, Britt;Hamilton, Guy;41;No;NicholasCage.png 1985;96;Marbella;Action;Taylor, Rod;Ekland, Britt;Hermoso, Miguel;45;No;NicholasCage.png 1967;103;Bobo, The;Comedy;Sellers, Peter;Ekland, Britt;Parrish, Robert;80;No;NicholasCage.png 1993;53;Big Bands, The;Music;Beneke, Tex;Elgart, Les;;48;No;NicholasCage.png 1992;97;Killer Image.;Mystery;Ironside, Michael;Errickson, 
Krista;Winning, David;8;No;NicholasCage.png 1987;94;Kandyland;Drama;Laulette, Charles;Evenson, Kim;Schnitzer, Robert Allen;41;No;NicholasCage.png 1987;94;Campus Man;Drama;Dye, John;Fairchild, Morgan;Casden, Ron;38;No;NicholasCage.png 1956;101;Jubal;Drama;Ford, Glenn;Farr, Felicia;Daves, Delmer;32;No;glennFord.png 1985;84;Purple Rose of Cairo, The;Comedy;Aiello, Danny;Farrow, Mia;Allen, Woody;20;Yes;woody.png 1984;85;Broadway Danny Rose;Comedy;Allen, Woody;Farrow, Mia;Allen, Woody;14;No;woody.png 1992;108;Husbands & Wives;Comedy;Allen, Woody;Farrow, Mia;Allen, Woody;80;No;woody.png 1986;103;Hannah & Her Sisters;Comedy;Caine, Michael;Farrow, Mia;Allen, Woody;8;Yes;woody.png 1979;115;Hurricane;Action;Robards, Jason;Farrow, Mia;Troell, Jan;8;No;NicholasCage.png 1986;95;Between Two Women;Drama;Nouri, Michael;Fawcett, Farrah;Avnet, John;52;No;NicholasCage.png 1981;96;Cannonball Run, The;Comedy;Reynolds, Burt;Fawcett, Farrah;Needham, Hal;80;No;NicholasCage.png 1936;70;Doughnuts & Society;Comedy;Nugent, Eddie;Fazenda, Louise;Collins, Lewis D.;28;No;NicholasCage.png 1978;450;Holocaust;Drama;Bottoms, Joseph;Feldshuh, Tovah;Chomsky, Marvin J.;1;No;NicholasCage.png 1990;103;Meridian;Science Fiction;Jamieson, Malcolm;Fenn, Sherilyn;Band, Charles;47;No;NicholasCage.png 1992;90;Diary of a Hitman;Drama;Whitaker, Forest;Fenn, Sherilyn;London, Roy;67;No;NicholasCage.png 1988;95;Gor;Action;Reed, Oliver;Ferratti, Rebecca;Kiersch, Fritz;2;No;NicholasCage.png 1987;95;Surrender;Comedy;Caine, Michael;Field, Sally;Belson, Jerry;84;No;NicholasCage.png 1984;112;Places in the Heart;Drama;Harris, Ed;Field, Sally;Benton, Robert;83;Yes;NicholasCage.png 1991;106;Not Without My Daughter;Drama;Molina, Alfred;Field, Sally;Gilbert, Brian;55;No;NicholasCage.png 1977;113;Heroes;Drama;Winkler, Henry;Field, Sally;Kagan, Jeremy Paul;17;No;NicholasCage.png 1981;116;Absence of Malice;Drama;Newman, Paul;Field, Sally;Pollack, Sydney;76;No;paulNewman.png 1979;110;Norma Rae;Drama;Bridges, Beau;Field, 
Sally;Ritt, Martin;64;Yes;NicholasCage.png 1989;118;Steel Magnolias;Drama;Skerritt, Tom;Field, Sally;Ross, Herbert;66;No;NicholasCage.png 1989;101;Burbs, The;Comedy;Hanks, Tom;Fisher, Carrie;Dante, Joe;42;No;NicholasCage.png 1980;124;Empire Strikes Back, The;Science Fiction;Hamill, Mark;Fisher, Carrie;Kershner, Irvin;33;No;NicholasCage.png 1977;121;Star Wars;Science Fiction;Hamill, Mark;Fisher, Carrie;Lucas, George;44;No;NicholasCage.png 1983;132;Return of the Jedi;Science Fiction;Hamill, Mark;Fisher, Carrie;Marquand, Richard;4;No;NicholasCage.png 1991;104;Hear My Song;Drama;Dunbar, Adrian;Fitzgerald, Tara;Chelsom, Peter;72;No;NicholasCage.png 1956;99;Slightly Scarlet;Action;Payne, John;Fleming, Rhonda;Dwan, Allan;52;No;NicholasCage.png 1957;120;Gunfight at the OK Corral;Western;Lancaster, Burt;Fleming, Rhonda;Sturges, John;84;No;burtLancaster.png 1931;;Range Feud, The;Western;Wayne, John;Fleming, Susan;Lederman, Ross;51;No;johnWayne.png 1990;89;Bloodsucking Pharaohs in Pittsburgh;Comedy;Dengel, Jake;Fletcher, Suzanne;Smithey, Alan;79;No;NicholasCage.png 1972;129;Roma;Drama;Gonzales, Peter;Florence, Fiona;Fellini, Federico;75;No;NicholasCage.png 1979;122;China Syndrome, The;Drama;Douglas, Michael;Fonda, Jane;Bridges, James;43;No;NicholasCage.png 1986;100;Morning After, The;Mystery;Bridges, Jeff;Fonda, Jane;Lumet, Sidney;6;No;NicholasCage.png 1971;114;Klute;Drama;Sutherland, Donald;Fonda, Jane;Pakula, Alan J.;15;Yes;NicholasCage.png 1979;113;Electric Horseman, The;Comedy;Redford, Robert;Fonda, Jane;Pollack, Sydney;34;No;NicholasCage.png 1965;97;Cat Ballou;Comedy;Marvin, Lee;Fonda, Jane;Silverstein, Elliot;62;Yes;NicholasCage.png 1991;;Coming Home;Drama;Voight, Jon;Fonda, Jane;;1;Yes;NicholasCage.png 1940;130;Rebecca;Drama;Olivier, Laurence;Fontaine, Joan;Hitchcock, Alfred;78;Yes;alfredHitchcock.png 1944;96;Jane Eyre;Drama;Welles, Orson;Fontaine, Joan;Stevenson, Robert;44;No;NicholasCage.png 1973;87;Stacey!;Action;Randall, Anne;Ford, Anitra;Sidaris, 
Andy;31;No;NicholasCage.png 1992;85;Naked Obsession;Mystery;Katt, William;Ford, Maria;Golden, Dan;26;No;NicholasCage.png 1989;83;Stripped to Kill II, Live Girls;Mystery;Lottimer, Ed;Ford, Maria;Ruben, Katt Shea;80;No;NicholasCage.png 1990;94;Rain Killer, The;Mystery;Sharkey, Ray;Ford, Maria;Stein, Ken;10;No;NicholasCage.png 1983;95;Valley Girl;Comedy;Cage, Nicolas;Foreman, Deborah;Coolidge, Martha;30;No;NicholasCage.png 1991;118;Silence of the Lambs, The;Mystery;Hopkins, Anthony;Foster, Jodie;Demme, Jonathan;8;Yes;AnthonyHopkins.png 1988;98;Stealing Home;Drama;Harmon, Mark;Foster, Jodie;Kampmann, Steven ;76;No;NicholasCage.png 1972;92;Napoleon & Samantha;Comedy;Douglas, Michael;Foster, Jodie;McEveety, Bernard;33;No;NicholasCage.png 1988;;Five Corners;Drama;Robbins, Tim;Foster, Jodie;;88;No;NicholasCage.png 1955;;Blackboard Jungle, The;Drama;Ford, Glenn;Francis, Anne;Brooks, Richard;66;No;glennFord.png 1989;103;My Left Foot;Drama;Day-Lewis, Daniel;Fricker, Brenda;Sheridan, Jim;32;Yes;NicholasCage.png 1987;92;Back to the Beach;Comedy;Avalon, Frankie;Funicello, Annette;Hobbs, Lyndall;45;No;NicholasCage.png 1934;85;Painted Veil, The;Drama;Marshall, Herbert;Garbo, Greta;Boleslawski, Richard;57;No;gretaGarbo.png 1931;74;Inspiration;Drama;Apfel, Oscar;Garbo, Greta;Brown, Clarence;66;No;gretaGarbo.png 1930;92;Anna Christie;Drama;Bickford, Charles;Garbo, Greta;Brown, Clarence;0;No;gretaGarbo.png 1926;109;Flesh & the Devil, The;Drama;Gilbert, John;Garbo, Greta;Brown, Clarence;72;No;gretaGarbo.png 1928;90;Woman of Affairs;Drama;Gilbert, John;Garbo, Greta;Brown, Clarence;83;No;gretaGarbo.png 1935;96;Anna Karenina;Drama;March, Fredric;Garbo, Greta;Brown, Clarence;35;Yes;gretaGarbo.png 1936;110;Camille;Drama;Taylor, Robert;Garbo, Greta;Cukor, George;74;No;gretaGarbo.png 1931;91;Mata Hari;Drama;Novarro, Ramon;Garbo, Greta;Fitzmaurice, George;67;No;gretaGarbo.png 1929;100;Wild Orchids;Drama;Stone, Lewis;Garbo, Greta;Franklin, Sidney;70;No;gretaGarbo.png 1932;112;Grand 
Hotel;Drama;Barrymore, John;Garbo, Greta;Goulding, Edmund;81;Yes;gretaGarbo.png 1931;84;Susan Lennox, Her Fall & Rise;Drama;Hale, Alan;Garbo, Greta;Leonard, Robert Z.;64;No;gretaGarbo.png 1939;108;Ninotchka;Comedy;Douglas, Melvyn;Garbo, Greta;Lubitsch, Ernst;40;No;gretaGarbo.png 1933;97;Queen Christina;Drama;Gilbert, John;Garbo, Greta;Mamoulian, Rouben;82;No;gretaGarbo.png 1928;96;Mysterious Lady, The;Drama;Nagel, Conrad;Garbo, Greta;Niblo, Fred;72;No;gretaGarbo.png 1925;125;Joyless Street;Drama;Stuart, Henry;Garbo, Greta;Pabst, Georg Wilhelm;73;No;gretaGarbo.png 1929;74;Single Standard, The;Drama;Asther, Nils;Garbo, Greta;Robertson, John S.;73;No;gretaGarbo.png 1932;71;As You Desire Me;Drama;Douglas, Melvyn;Garbo, Greta;;85;No;gretaGarbo.png 1930;76;Romance;Drama;Stone, Lewis;Garbo, Greta;;62;No;gretaGarbo.png 1962;105;A Child Is Waiting;Drama;Lancaster, Burt;Garland, Judy;Cassavetes, John;60;No;burtLancaster.png 1982;116;Tootsie;Comedy;Hoffman, Dustin;Garr, Teri;Pollack, Sydney;8;Yes;NicholasCage.png 1989;86;Let It Ride;Comedy;Dreyfuss, Richard;Garr, Teri;Pytka, Joe;88;No;NicholasCage.png 1953;120;Julius Caesar;Drama;Brando, Marlon;Garson, Greer;Mankiewicz, Joseph L.;50;No;brando.png 1979;120;Nineteen Forty-One;Comedy;Belushi, John;Gary, Lorraine;Spielberg, Steven;24;No;NicholasCage.png 1975;124;Jaws;Action;Scheider, Roy;Gary, Lorraine;Spielberg, Steven;6;No;NicholasCage.png 1987;93;Hot Pursuit;Drama;Cusack, John;Gazelle, Wendy;Lisberger, Steven;44;No;NicholasCage.png 1989;120;Triumph of the Spirit;Drama;Dafoe, Willem;Gazelle, Wendy;Young, Robert M.;49;No;NicholasCage.png 1975;111;Brannigan;Drama;Wayne, John;Geeson, Judy;Hickox, Douglas;64;No;johnWayne.png 1979;89;Buffet Froid;Comedy;Depardieu, Gérard;Gence, Denise;Blier, Bertrand;75;No;NicholasCage.png 1986;122;Salvador;Drama;Woods, James;Gibb, Cynthia;Stone, Oliver;77;No;NicholasCage.png 1959;102;Horse Soldiers, The;Western;Wayne, John;Gibson, Althea;Ford, John;76;No;johnWayne.png 1954;108;Long John 
Silver;Action;Newton, Robert;Gilchrist, Connie;Haskin, Byron;56;No;NicholasCage.png 1961;134;Hustler, The;Drama;Newman, Paul;Gleason, Jackie;Rossen, Robert;43;Yes;paulNewman.png 1983;109;Star Chamber, The;Drama;Douglas, Michael;Gless, Sharon;Hyam, Peter;3;No;NicholasCage.png 1988;100;Clara's Heart;Drama;Ontkean, Michael;Goldberg, Whoopi;Mulligan, Robert;60;No;NicholasCage.png 1987;102;Burglar;Comedy;Goldthwait, Bob;Goldberg, Whoopi;Wilson, Hugh;44;No;NicholasCage.png 1986;120;Comic Relief;Comedy;Crystal, Billy;Goldberg, Whoopi;;69;No;NicholasCage.png 1978;117;Bloodbrothers;Drama;Sorvino, Paul;Goldoni, Lelia;Mulligan, Robert;11;No;NicholasCage.png 1988;134;Rain Man;Drama;Hoffman, Dustin;Golino, Valeria;Levinson, Barry;8;Yes;NicholasCage.png 1966;95;Masculine Feminine;Drama;Leaud, Jean-Pierre;Goya, Chantal;Godard, Jean-Luc;20;No;NicholasCage.png 1964;51;Outer Limits, The;Science Fiction;Perrin, Vic;Grahame, Gloria;Stanley, Paul;27;No;NicholasCage.png 1988;;Mama's Dirty Girls;Horror;Currie, Sondra;Grahame, Gloria;;62;No;NicholasCage.png 1979;180;Last Ride of the Dalton Gang, The;Western;Palance, Jack;Greenbush, Lindsay;Curtis, Dan;62;No;NicholasCage.png 1991;;Why Me?;Comedy;Lambert, Christopher;Greist, Kim;;74;No;NicholasCage.png 1932;66;Number Seventeen;Crime;Lion, Leon M.;Grey, Anne;Hitchcock, Alfred;66;No;alfredHitchcock.png 1986;120;Manhunter;Drama;Petersen, William L.;Griest, Kim;Mann, Michael;19;No;NicholasCage.png 1990;126;Bonfire of the Vanities, The;Drama;Hanks, Tom;Griffith, Melanie;De Palma, Brian;82;No;NicholasCage.png 1988;115;Working Girl;Comedy;Ford, Harrison;Griffith, Melanie;Nichols, Mike;25;No;NicholasCage.png 1992;133;Shining Through;Mystery;Douglas, Michael;Griffith, Melanie;Seltzer, David;11;No;NicholasCage.png 1991;76;Slumber Party Massacre III;Horror;Christian, Keely;Grye, Brittain;;40;No;NicholasCage.png 1988;99;Tokyo Pop;Comedy;Tadokoro, Yutaka;Hamilton, Carrie;Kuzui, Fran Rubel;2;No;NicholasCage.png 1991;136;Terminator 
2;Action;Schwarzenegger, Arnold;Hamilton, Linda;Cameron, James;8;No;T2.png 1984;108;Terminator, The;Action;Schwarzenegger, Arnold;Hamilton, Linda;Cameron, James;17;No;T2.png 1986;105;King Kong Lives!;Action;Kerwin, Brian;Hamilton, Linda;Guillermin, John;20;No;NicholasCage.png 1969;125;Those Daring Young Men in Their Jaunty;Comedy;Curtis, Tony;Hampshire, Susan;;59;No;NicholasCage.png 1991;186;At Play in the Fields of the Lord;Drama;Berenger, Tom;Hannah, Daryl;Babenco, Hector;81;No;NicholasCage.png 1990;;Crazy People;Comedy;Moore, Dudley;Hannah, Daryl;Bill, Tony;61;No;NicholasCage.png 1992;99;Memoirs of an Invisible Man;Comedy;Chase, Chevy;Hannah, Daryl;Carpenter, John;58;No;NicholasCage.png 1985;100;Clan of the Cave Bear, The;Drama;Remar, James;Hannah, Daryl;Chapman, Michael;73;No;NicholasCage.png 1983;82;Final Terror, The;Horror;Zmed, Adrian;Hannah, Daryl;Davis, Andrew;24;No;NicholasCage.png 1984;93;Reckless;Drama;Quinn, Aidan;Hannah, Daryl;Foley, James;14;No;NicholasCage.png 1989;;High Spirits;Comedy;O'Toole, Peter;Hannah, Daryl;Jordan, Neil;53;No;NicholasCage.png 1987;107;Roxanne;Comedy;Martin, Steve;Hannah, Daryl;Schepisi, Fred;66;No;NicholasCage.png 1982;117;Blade Runner;Action;Ford, Harrison;Hannah, Daryl;Scott, Ridley;1;No;NicholasCage.png 1987;126;Wall Street;Drama;Douglas, Michael;Hannah, Daryl;Stone, Oliver;6;Yes;NicholasCage.png 1992;111;Pope of Greenwich Village;Drama;Rourke, Mickey;Hannah, Daryl;;58;No;NicholasCage.png 1989;89;After School;Drama;Bottoms, Sam;Hannah, Page;;59;No;NicholasCage.png 1938;298;Flaming Frontiers;Western;Brown, Johnny Mack;Hansen, Eleanor;Taylor, Ray;82;No;NicholasCage.png 1936;89;Libeled Lady;Comedy;Powell, William;Harlow, Jean;Conway, Jack;86;No;NicholasCage.png 1976;99;Inserts;Drama;Dreyfuss, Richard;Harper, Jessica;Byrum, John;85;No;NicholasCage.png 1988;88;Blue Iguana, The;Drama;McDermott, Dylan;Harper, Jessica;Lafia, John;65;No;NicholasCage.png 1983;93;Tender Mercies;Drama;Duvall, Robert;Harper, Tess;Beresford, 
Bruce;61;Yes;NicholasCage.png 1987;96;Nights in White Satin;Drama;Gilman, Kenneth;Harris, Priscilla;Barnard, Michael;5;No;NicholasCage.png 1989;87;Videodrome;Horror;Woods, James;Harry, Deborah;Cronenberg, David;36;No;NicholasCage.png 1991;96;Intimate Stranger;Mystery;Russo, James;Harry, Deborah;Holzman, Allan;23;No;NicholasCage.png 1986;110;Highlander;Science Fiction;Lambert, Christopher;Hart, Roxanne;Mulcahy, Russell;8;No;NicholasCage.png 1987;93;Bodycount;Action;White, Bernie;Hassett, Marilyn;;51;No;NicholasCage.png 1989;104;Tango & Cash;Action;Stallone, Sylvester;Hatcher, Teri;Konchalovsky, Andrei;9;No;NicholasCage.png 1970;94;There's a Girl in My Soup;Comedy;Sellers, Peter;Hawn, Goldie;Boulting, Roy;41;No;NicholasCage.png 1984;100;Swing Shift;Drama;Russell, Kurt;Hawn, Goldie;Demme, Jonathan;81;No;NicholasCage.png 1978;112;Foul Play;Comedy;Chase, Chevy;Hawn, Goldie;Higgins, Colin;46;No;NicholasCage.png 1982;109;Best Friends;Comedy;Reynolds, Burt;Hawn, Goldie;Jewison, Norman;74;No;NicholasCage.png 1972;109;Butterflies Are Free;Drama;Albert, Edward;Hawn, Goldie;Katselas, Milton;82;Yes;NicholasCage.png 1987;112;Overboard;Comedy;Russell, Kurt;Hawn, Goldie;Marshall, Garry;6;No;NicholasCage.png 1974;103;Girl from Petrovka, The;Drama;Holbrook, Hal;Hawn, Goldie;Miller, Robert Ellis;23;No;NicholasCage.png 1992;102;Housesitter;Comedy;Martin, Steve;Hawn, Goldie;Oz, Frank;14;No;NicholasCage.png 1986;106;Wildcats;Comedy;Keach, James;Hawn, Goldie;Ritchie, Michael;22;No;NicholasCage.png 1984;100;Protocol;Comedy;Sarandon, Chris;Hawn, Goldie;Ross, Herbert;53;No;NicholasCage.png 1980;102;Seems Like Old Times;Comedy;Chase, Chevy;Hawn, Goldie;Sandrich, Jay;49;No;NicholasCage.png 1974;109;Sugarland Express, The;Drama;Johnson, Ben;Hawn, Goldie;Spielberg, Steven;28;No;NicholasCage.png 1980;110;Private Benjamin;Comedy;Assante, Armand;Hawn, Goldie;Zieff, Howard;61;No;NicholasCage.png 1991;115;Deceived;Mystery;Heard, John;Hawn, Goldie;;55;No;NicholasCage.png 
1931;95;Arrowsmith;Drama;Colman, Ronald;Hayes, Helen;Ford, John;84;No;johnFord.png 1972;78;Say Goodbye Maggie Cole;Drama;McGavin, Darren;Hayward, Susan;Taylor, Jud;84;No;NicholasCage.png 1964;132;Circus World;Drama;Wayne, John;Hayworth, Rita;Hathaway, Henry;29;No;johnWayne.png 1952;98;Affair in Trinidad;Drama;Ford, Glenn;Hayworth, Rita;Sherman, Vincent;49;No;glennFord.png 1948;87;Lady from Shanghai;Mystery;Welles, Orson;Hayworth, Rita;Welles, Orson;16;No;NicholasCage.png 1940;81;Lady in Question;Drama;Aherne, Brian;Hayworth, Rita;Vidor, Charles;57;No;NicholasCage.png 1946;110;Gilda;Drama;Ford, Glenn;Hayworth, Rita;Vidor, Charles;57;No;glennFord.png 1948;98;Loves of Carmen, The;Drama;Ford, Glenn;Hayworth, Rita;Vidor, Charles;48;No;glennFord.png 1990;105;Dick Tracy;Comedy;Beatty, Warren;Headley, Glenne;Beatty, Warren;84;No;NicholasCage.png 1964;130;Marnie;Drama;Connery, Sean;Hedren, Tippi;Hitchcock, Alfred;2;No;seanConnery.png 1987;85;Hot Child in the City;Mystery;Prysirr, Geof;Hendrix, Leah Ayres;Florea, John;0;No;NicholasCage.png 1984;90;Johnny Dangerously;Comedy;Piscopo, Joe;Henner, Marilu;Heckerling, Amy;3;No;NicholasCage.png 1985;95;Stark;Mystery;Surovy, Nicolas;Henner, Marilu;Holcomb, Rod;27;No;NicholasCage.png 1949;84;Three Strange Loves;Drama;Malmsten, Birger;Henning, Eva;Bergman, Ingmar;87;No;Bergman.png 1964;170;My Fair Lady;Music;Harrison, Rex;Hepburn, Audrey;Cukor, George;10;Yes;NicholasCage.png 1960;123;Unforgiven, The;Drama;Lancaster, Burt;Hepburn, Audrey;Huston, John;32;No;burtLancaster.png 1976;106;Robin & Marian;Action;Connery, Sean;Hepburn, Audrey;Lester, Richard;6;No;seanConnery.png 1961;109;Children's Hour, The;Drama;Garner, James;Hepburn, Audrey;Wyler, William;60;No;NicholasCage.png 1956;121;Rainmaker, The;Drama;Lancaster, Burt;Hepburn, Katharine;Anthony, Joseph;21;No;katharineHepburn.png 1952;95;Pat & Mike;Comedy;Tracy, Spencer;Hepburn, Katharine;Cukor, George;48;No;spencerTracy.png 1968;134;Lion in Winter, THe;Drama;O'Toole, Peter;Hepburn, 
Katharine;Harvey, Anthony;78;Yes;katharineHepburn.png 1991;132;Sea of Grass, The;Western;Tracy, Spencer;Hepburn, Katharine;Kazan, Elia;75;No;spencerTracy.png 1967;108;Guess Who's Coming to Dinner;Drama;Tracy, Spencer;Hepburn, Katharine;Kramer, Stanley;50;Yes;spencerTracy.png 1957;153;Desk Set;Comedy;Tracy, Spencer;Hepburn, Katharine;Lang, Walter;51;No;spencerTracy.png 1975;107;Rooster Cogburn;Western;Wayne, John;Hepburn, Katharine;Miller, Stuart;76;No;johnWayne.png 1981;109;On Golden Pond;Drama;Fonda, Henry;Hepburn, Katharine;Rydell, Mark;23;Yes;katharineHepburn.png 1991;101;Adam's Rib;Comedy;Tracy, Spencer;Hepburn, Katharine;;62;No;spencerTracy.png 1991;116;Boom Town;Drama;Tracy, Spencer;Hepburn, Katharine;;73;No;katharineHepburn.png 1991;145;Dragon Seed;Drama;Tracy, Spencer;Hepburn, Katharine;;34;No;katharineHepburn.png 1991;115;Little Women;Drama;Tracy, Spencer;Hepburn, Katharine;;22;No;katharineHepburn.png 1991;113;Philadelphia Story, The;Comedy;Tracy, Spencer;Hepburn, Katharine;;25;No;katharineHepburn.png 1991;112;Without Love;Comedy;Tracy, Spencer;Hepburn, Katharine;;66;No;katharineHepburn.png 1991;113;Woman of the Year;Comedy;Tracy, Spencer;Hepburn, Katharine;;12;No;spencerTracy.png 1992;95;Juice;Drama;Shakur, Tupac;Herron, Cindy;Dickerson, Ernest R.;31;No;NicholasCage.png 1986;114;Hoosiers;Drama;Hackman, Gene;Hershey, Barbara;Anspaugh, David;2;No;NicholasCage.png 1987;112;Tin Men;Comedy;Dreyfuss, Richard;Hershey, Barbara;Levinson, Barry;50;No;NicholasCage.png 1988;163;Last Temptation of Christ, The;Drama;Dafoe, Willem;Hershey, Barbara;Scorsese, Martin;32;No;NicholasCage.png 1991;99;Paris Trout;Drama;Hopper, Dennis;Hershey, Barbara;;53;No;NicholasCage.png 1988;87;Souvenir;Drama;Plummer, Christopher;Hicks, Catherine;Reeve, Geoffrey;42;No;NicholasCage.png 1966;120;A Man for All Seasons;Drama;Shaw, Robert;Hiller, Wendy;Zinnemann, Fred;20;Yes;NicholasCage.png 1986;90;Knights & Emeralds;Drama;Leadbitter, Bill;Hills, Beverly;Emes, Ian;;No;NicholasCage.png 
1989;83;Masque of the Red Death;Horror;MacNee, Patrick;Hoak, Clare;Brand, Larry;9;No;NicholasCage.png 1943;265;Adventures of Smilin' Jack, The;Mystery;Brown, Tom;Hobart, Rose;Taylor, Ray;77;No;NicholasCage.png 1992;88;Adventures in Dinosaur City;Action;Katz, Omri;Hoffman, Shawn;Thompson, Brett;19;No;NicholasCage.png 1987;95;Allnighter, The;Comedy;Terlesky, John;Hoffs, Susanna;Hoffs, Tamar Simon;71;No;NicholasCage.png 1980;99;Caddyshack;Comedy;Chase, Chevy;Holcomb, Sarah;Ramis, Harold;70;No;NicholasCage.png 1973;102;Tom Sawyer;Music;Whitaker, Johnny;Holm, Celeste;Taylor, Don;11;No;NicholasCage.png 1987;94;Rita, Sue & Bob Too;Comedy;Finneran, Siohban;Holmes, Michelle;Clarke, Alan;5;No;NicholasCage.png 1947;56;Hawk of Powder River;Western;Dean, Eddie;Holt, Jennifer;Taylor, Ray;61;No;NicholasCage.png 1928;148;Tempest;Drama;Barrymore, John;Horn, Camilla;Taylor, Sam;33;No;NicholasCage.png 1986;90;Running Mates;Drama;Webb, Greg;Howard, Barbara;Neff, Thomas L.;63;No;NicholasCage.png 1987;105;Prettykill;Drama;Birney, David;Hubley, Season;Kaczender, George;71;No;NicholasCage.png 1934;80;Judge Priest;Drama;Rogers, Will;Hudson, Rochelle;Ford, John;9;No;johnFord.png 1950;104;Harvey;Comedy;Stewart, James;Hull, Josephine;Koster, Henry;42;No;NicholasCage.png 1991;89;If Looks Could Kill;Action;Grieco, Richard;Hunt, Linda;Wilmington, Michael;10;No;NicholasCage.png 1987;94;Raising Arizona;Comedy;Cage, Nicolas;Hunter, Holly;Coen, Joel;23;No;NicholasCage.png 1989;114;Once Around;Comedy;Dreyfuss, Richard;Hunter, Holly;Hallström, Lasse;68;No;NicholasCage.png 1980;110;Loulou;Drama;Depardieu, Gérard;Huppert, Isabelle;Pialat, Maurice;65;No;NicholasCage.png 1982;136;World According to Garp, The;Drama;Williams, Robin;Hurt, Mary Beth;Hill, George Roy;59;No;NicholasCage.png 1980;106;Virus;Science Fiction;Kennedy, George;Hussey, Olivia;Fukasaku, Kinji;62;No;NicholasCage.png 1940;127;Northwest Passage;Action;Tracy, Spencer;Hussey, Ruth;Vidor, King;51;No;spencerTracy.png 1987;112;Gardens of 
Stone;Drama;Caan, James;Huston, Anjelica;Coppola, Francis Ford;27;No;NicholasCage.png 1989;121;Enemies, a Love Story;Drama;Silver, Ron;Huston, Anjelica;Mazursky, Paul;5;No;NicholasCage.png 1992;102;Addams Family, The;Comedy;Julia, Raul;Huston, Anjelica;Sonnenfeld, B.;8;No;NicholasCage.png 1932;65;Freaks;Horror;Ford, Wallace;Hyams, Leila;Browning, Tod;61;No;NicholasCage.png 1991;108;Necessary Roughness;Comedy;Bakula, Scott;Ireland, Kathy;Dragoti, Stan;60;No;NicholasCage.png 1990;93;A Show of Force;Drama;Garcia, Andy;Irving, Amy;Barreto, Bruno;1;No;NicholasCage.png 1980;129;Competition, The;Drama;Dreyfuss, Richard;Irving, Amy;Oliansky, Joel;45;No;NicholasCage.png 1988;97;Crossing Delancey;Comedy;Riegert, Peter;Irving, Amy;Silver, Joan Micklin;6;No;NicholasCage.png 1982;120;State of Things, The;Drama;Kime, Jeffrey;Isabelle Weingarten.;Wenders, Wim;73;No;NicholasCage.png 1987;89;Business As Usual;Comedy;Thaw, John;Jackson, Glenda;Barrett, Lezli-An;17;No;NicholasCage.png 1973;103;A Touch of Class;Comedy;Segal, George;Jackson, Glenda;Frank, Melvin;79;Yes;NicholasCage.png 1970;129;Women in Love.;Drama;Bates, Alan;Jackson, Glenda;Russell, Ken;18;No;NicholasCage.png 1988;89;Salome's Last Dance;Comedy;Johns, Stratford;Jackson, Glenda;Russell, Ken;76;No;NicholasCage.png 1986;100;Casino;Mystery;Connors, Mike;Jackson, Sherry;Chaffey, Don;5;No;NicholasCage.png 1955;108;Smiles of a Summer Night;Comedy;Björnstrand, Gunnar;Jacobsson, Ulla;Bergman, Ingmar;58;No;Bergman.png 1989;90;New Year's Day;Comedy;Jaglom, Henry;Jakobsen, Maggie;Jaglom, Henry;88;No;NicholasCage.png 1981;132;Mephisto;Drama;Brandauer, Klaus Maria;Janda, Krystyna;Szabó, István;80;Yes;NicholasCage.png 1927;60;Easy Virtue;Mystery;Dyall, Franklin;Jeans, Isabel;Hitchcock, Alfred;45;No;alfredHitchcock.png 1937;59;Swing It Sailor!;Comedy;Ford, Wallace;Jewell, Isabel;;6;No;NicholasCage.png 1991;83;Strictly Business;Comedy;Davidson, Tommy;Johnson, Anne-Marie;Hooks, Kevin;3;No;NicholasCage.png 1983;90;Blame It on 
Rio;Comedy;Caine, Michael;Johnson, Michelle;Donen, Stanley;10;No;NicholasCage.png 1987;86;Straight to Hell;Action;Hopper, Dennis;Jones, Grace;Cox, Alex;47;No;NicholasCage.png 1990;131;A View to a Kill;Action;Moore, Roger;Jones, Grace;;44;No;NicholasCage.png 1986;100;American Anthem;Drama;Gaylord, Mitch;Jones, Janet;Magnoli, Albert;74;No;NicholasCage.png 1963;99;Bedtime Story;Comedy;Brando, Marlon;Jones, Shirley;Levy, Ralph;7;No;brando.png 1991;117;Courtship of Eddie's Father, The;Comedy;Howard, Ron;Jones, Shirley;;43;No;NicholasCage.png 1988;102;Night Train to Katmandu, THe;Action;Roberts, Pernell;Jovovich, Milla;Wiemer, Robert;43;No;NicholasCage.png 1948;100;Port of Call;Drama;Eklund, Bengt;Jönsson, Nine-Christine;Bergman, Ingmar;29;No;Bergman.png 1973;103;Paper Moon;Comedy;O'Neal, Ryan;Kahn, Madeline;Bogdanovich, Peter;3;Yes;NicholasCage.png 1983;97;Yellowbeard;Comedy;Chapman, Graham;Kahn, Madeline;Damski, Mel;34;No;NicholasCage.png 1975;91;Adventures of Sherlock Holmes' Smarter;Comedy;Wilder, Gene;Kahn, Madeline;Wilder, Gene;42;No;NicholasCage.png 1990;108;Flashback;Comedy;Hopper, Dennis;Kane, Carol;Amurri, Franco;19;No;NicholasCage.png 1977;89;World's Greatest Lover, The;Comedy;Wilder, Gene;Kane, Carol;Wilder, Gene;42;No;NicholasCage.png 1955;67;Killer's Kiss;Mystery;Silvera, Frank;Kane, Irene;Kubrick, Stanley;66;No;NicholasCage.png 1988;103;Deceivers, The;Action;Brosnan, Pierce;Kapoor, Shashi;Meyer, Nicholas;14;No;NicholasCage.png 1983;97;Breathless;Action;Gere, Richard;Kaprisky, Valerie;McBride, Jim;51;No;NicholasCage.png 1989;145;Born on the Fourth of July;Drama;Cruise, Tom;Kava, Caroline;Stone, Oliver;8;Yes;NicholasCage.png 1991;120;Awakenings;Drama;De Niro, Robert;Kavner, Julie;Marshall, Penny;8;No;NicholasCage.png 1977;94;Annie Hall;Comedy;Allen, Woody;Keaton, Diane;Allen, Woody;68;Yes;woody.png 1979;96;Manhattan;Comedy;Allen, Woody;Keaton, Diane;Allen, Woody;82;Yes;woody.png 1981;195;Reds;Drama;Beatty, Warren;Keaton, Diane;Beatty, 
Warren;76;Yes;NicholasCage.png 1986;105;Crimes of the Heart;Comedy;Shepard, Sam;Keaton, Diane;Beresford, Bruce;84;No;NicholasCage.png 1977;136;Looking for Mr. Goodbar;Drama;Atherton, William;Keaton, Diane;Brooks, Richard;54;No;NicholasCage.png 1972;175;Godfather, The;Drama;Brando, Marlon;Keaton, Diane;Coppola, Francis Ford;8;Yes;brando.png 1974;201;Godfather, Pt 2., The;Drama;Pacino, Al;Keaton, Diane;Coppola, Francis Ford;8;Yes;NicholasCage.png 1976;109;I Will, I Will...For Now;Comedy;Gould, Elliott;Keaton, Diane;Panama, Norman;6;No;NicholasCage.png 1972;86;Play It Again, Sam;Comedy;Allen, Woody;Keaton, Diane;Ross, Herbert;81;No;woody.png 1975;82;Love & Death;Comedy;Allen, Woody;Keaton, Diane;;84;No;woody.png 1973;88;Sleeper;Comedy;Allen, Woody;Keaton, Diane;;59;No;woody.png 1970;130;Fellini Satyricon;Drama;Potter, Martin;Keller, Hiram;Fellini, Federico;88;No;NicholasCage.png 1980;117;Formula, The;Mystery;Scott, George C.;Keller, Marthe;Avildsen, John G.;82;No;NicholasCage.png 1977;143;Black Sunday;Drama;Shaw, Robert;Keller, Marthe;Frankenheimer, John;76;No;NicholasCage.png 1977;124;Bobby Deerfield;Drama;Pacino, Al;Keller, Marthe;Pollack, Sydney;36;No;NicholasCage.png 1972;98;Last of the Red Hot Lovers;Comedy;Arkin, Alan;Kellerman, Sally;Saks, Gene;40;No;NicholasCage.png 1953;116;Mogambo;Action;Gable, Clark;Kelly, Grace;Ford, John;71;No;johnFord.png 1955;103;To Catch a Thief;Mystery;Grant, Cary;Kelly, Grace;Hitchcock, Alfred;69;No;alfredHitchcock.png 1954;113;Rear Window;Mystery;Stewart, James;Kelly, Grace;Hitchcock, Alfred;25;No;alfredHitchcock.png 1945;69;Woman Who Came Back;Drama;Kruger, Otto;Kelly, Nancy;Colmes, Walter;26;No;NicholasCage.png 1939;101;Stanley & Livingstone;Action;Tracy, Spencer;Kelly, Nancy;King, Henry;11;No;spencerTracy.png 1956;129;Bad Seed, The;Horror;Jones, Henry;Kelly, Nancy;LeRoy, Mervyn;69;No;NicholasCage.png 1989;113;Lethal Weapon 2;Action;Gibson, Mel;Kensit, Patsy;Donner, Richard;69;No;NicholasCage.png 1992;79;Blame It on the 
Bellboy;Comedy;Moore, Dudley;Kensit, Patsy;Herman, Mark;69;No;NicholasCage.png 1927;62;Drop Kick, The;Drama;Barthelmess, Richard;Kent, Barbara;Webb, Millard;;No;NicholasCage.png 1978;145;Superman, The Movie;Action;Brando, Marlon;Kidder, Margot;Donner, Richard;87;No;brando.png 1987;90;Superman IV: The Quest for Peace;Action;Reeve, Christopher;Kidder, Margot;Furie, Sidney J.;77;No;NicholasCage.png 1970;90;Quackser Fortune Has a Cousin in the Bronx;Comedy;Wilder, Gene;Kidder, Margot;Waris, Hussein;49;No;NicholasCage.png 1989;96;Dead Calm;Mystery;Neill, Sam;Kidman, Nicole;Noyce, Phillip;1;No;NicholasCage.png 1990;107;Days of Thunder;Action;Cruise, Tom;Kidman, Nicole;Scott, Tony;3;No;NicholasCage.png 1987;101;My Life As a Dog;Comedy;Glanzelius, Anton;Kinnaman, Melinda;Hallström, Lasse;21;No;NicholasCage.png 1983;;Moon in the Gutter, The;Action;Depardieu, Gérard;Kinski, Nastassia;Beineix, Jean-Jacques;29;No;NicholasCage.png 1984;150;Paris, Texas;Drama;Stanton, Harry Dean;Kinski, Nastassia;Wenders, Wim;27;No;NicholasCage.png 1984;96;Unfaithfully Yours;Comedy;Moore, Dudley;Kinski, Nastassia;Zieff, Howard;73;No;NicholasCage.png 1987;95;Bullseye!;Comedy;Caine, Michael;Kirkland, Sally;Winner, Michael;8;No;NicholasCage.png 1989;104;Erik the Viking;Action;Robbins, Tim;Kitt, Eartha;Jones, Terry;25;No;NicholasCage.png 1987;90;Dragonard;Drama;Reed, Oliver;Kitt, Eartha;Kikoine, Gerard;71;No;NicholasCage.png 1986;90;Hard Choices;Drama;McCleery, Gary;Klenck, Margaret;King, Rick;41;No;NicholasCage.png 1969;102;Rain People, The;Drama;Caan, James;Knight, Shirley;Coppola, Francis Ford;78;No;NicholasCage.png 1984;106;A Year of the Quiet Sun;Drama;Wilson, Scott;Komorowska, Maja;Zanussi, Krzystoff;78;No;NicholasCage.png 1935;54;Desert Trail, The;Western;Wayne, John;Kornman, Mary;Collins, Lewis D.;50;No;johnWayne.png 1990;98;Almost an Angel;Comedy;Hogan, Paul;Kozlowski, Linda;Cornell, John;14;No;NicholasCage.png 1986;98;Crocodile Dundee;Comedy;Hogan, Paul;Kozlowski, Linda;Faiman, 
Peter;66;No;NicholasCage.png 1977;127;American Friend, The;Mystery;Hopper, Dennis;Kreuzer, Lisa;Wenders, Wim;35;No;NicholasCage.png 1989;119;See You in the Morning;Drama;Bridges, Jeff;Krige, Alice;Pakula, Alan J.;53;No;NicholasCage.png 1987;88;Arrogant, The;Drama;Graham, Gary;Kristel, Sylvia;Blot, Philippe;62;No;NicholasCage.png 1989;86;Dracula's Widow;Horror;Sommer, Josef;Kristel, Sylvia;Coppola, Christopher;55;No;NicholasCage.png 1987;90;Ninja Masters of Death;Action;Peterson, Chris;Kruize, Kelly;Lambert, Bruce;15;No;NicholasCage.png 1990;110;Mystery Train;Comedy;Nagase, Masatoshi;Kudoh, Youki;Jarmusch, Jim;23;No;NicholasCage.png 1978;114;Go Tell the Spartans;War;Lancaster, Burt;Kumagai, Denice;Post, Ted;67;No;burtLancaster.png 1986;89;True Stories;Comedy;Goodman, John;Kurtz, Swoosie;Byrne, David;79;No;NicholasCage.png 1953;94;Ugetsu Monogatari;Drama;Mori, Masayuki;Kyô, Machiki;Mizoguchi, Kenji;82;No;NicholasCage.png 1969;80;Rebel Rousers;Action;Nicholson, Jack;Ladd, Diane;Cohen, Martin B.;44;No;JackNicholson.png 1988;98;Plain Clothes;Comedy;Howard, Arliss;Ladd, Diane;Coolidge, Martha;4;No;NicholasCage.png 1981;119;Whose Life Is It, Anyway?;Drama;Dreyfuss, Richard;Lahti, Christine;Badham, John;62;No;NicholasCage.png 1988;116;Running on Empty;Drama;Hirsch, Judd;Lahti, Christine;Lumet, Sidney;2;No;NicholasCage.png 1990;101;Funny about Love;Comedy;Wilder, Gene;Lahti, Christine;Nimoy, Leonard;60;No;NicholasCage.png 1985;118;A Chorus Line, The Movie;Music;Douglas, Michael;Landers, Audrey;Attenborough, Richard;71;No;NicholasCage.png 1986;84;Stewardess School;Comedy;Most, Donald;Landers, Judy;Blancato, Ken;28;No;NicholasCage.png 1987;109;Big Town, The;Drama;Dillon, Matt;Lane, Diane;Bolt, Ben;11;No;NicholasCage.png 1983;94;Rumble Fish;Drama;Dillon, Matt;Lane, Diane;Coppola, Francis Ford;4;No;NicholasCage.png 1983;91;Outsiders, The;Drama;Howell, C. 
Thomas;Lane, Diane;Coppola, Francis Ford;56;No;NicholasCage.png 1990;94;Priceless Beauty;Science Fiction;Lambert, Christopher;Lane, Diane;Finch, Charles;7;No;NicholasCage.png 1989;93;Streets of Fire;Action;Paré, Michael;Lane, Diane;Hill, Walter;65;No;NicholasCage.png 1990;115;Men Don't Leave;Drama;Howard, Arliss;Lange, Jessica;Brickman, Paul;66;No;NicholasCage.png 1988;127;Everybody's All American;Romance;Quaid, Dennis;Lange, Jessica;Hackford, Taylor;62;No;NicholasCage.png 1992;128;Cape Fear;Mystery;De Niro, Robert;Lange, Jessica;Scorsese, Martin;7;No;NicholasCage.png 1992;121;Postman Always Rings Twice, The;Mystery;Nicholson, Jack;Lange, Jessica;;24;No;NicholasCage.png 1949;58;Crashing Thru;Western;Wilson, Whip;Larson, Christine;Taylor, Ray;19;No;NicholasCage.png 1978;109;Get Out Your Handkerchiefs;Comedy;Depardieu, Gérard;Laure, Carole;Blier, Bertrand;78;Yes;NicholasCage.png 1971;137;Boy Friend, THe;Music;Gable, Christopher;Lawson, Twiggy;Russell, Ken;8;No;NicholasCage.png 1990;100;Hard To Kill;Action;Seagal, Steven;LeBrock, Kelly;Malmuth, Bruce;49;No;NicholasCage.png 1960;109;Psycho;Horror;Perkins, Anthony;Leigh, Janet;Hitchcock, Alfred;56;No;alfredHitchcock.png 1957;112;Jet Pilot;Action;Wayne, John;Leigh, Janet;Sternberg, Josef von;43;No;johnWayne.png 1987;95;Under Cover;Mystery;Neidorf, David;Leigh, Jennifer Jason;Stockwell, John;36;No;NicholasCage.png 1951;122;A Streetcar Named Desire;Drama;Brando, Marlon;Leigh, Vivien;Kazan, Elia;75;Yes;brando.png 1986;93;Golden Child, The;Comedy;Murphy, Eddie;Lewis, Charlotte;Ritchie, Michael;86;No;NicholasCage.png 1971;84;Statue, The;Drama;Niven, David;Lisi, Virna;Amateau, Rod;80;No;NicholasCage.png 1985;128;Christopher Columbus;Drama;Byrne, Gabriel;Lisi, Virna;Lattuada, Alberto;69;No;NicholasCage.png 1989;116;In Country;Drama;Willis, Bruce;Lloyd, Emily;Jewison, Norman;76;No;NicholasCage.png 1978;132;Wild Geese, The;Action;Burton, Richard;Lloyd, Rosalind;McLaglen, Andrew V.;21;No;NicholasCage.png 1974;90;Second Coming of 
Suzanne., The;Drama;Dreyfuss, Richard;Locke, Sondra;Barry, Michael;21;No;NicholasCage.png 1980;116;Bronco Billy;Westerns;Eastwood, Clint;Locke, Sondra;Eastwood, Clint;57;No;clintEastwood.png 1977;109;Gauntlet, The;Action;Eastwood, Clint;Locke, Sondra;Eastwood, Clint;18;No;clintEastwood.png 1986;105;Ratboy;Drama;Townsend, Robert;Locke, Sondra;Locke, Sondra;1;No;NicholasCage.png 1938;96;Lady Vanishes;Mystery;Redgrave, Michael;Lockwood, Margaret;Hitchcock, Alfred;27;No;alfredHitchcock.png 1987;95;Kitchen Toto, THe;Drama;Peck, Bob;Logan, Phyllis;Hook, Harry;41;No;NicholasCage.png 1959;88;Carlton-Browne of the F.O.;Comedy;Terry-Thomas;Lohr, Marie;Boulting, Roy;63;No;NicholasCage.png 1929;68;Racketeer;Drama;Armstrong, Robert;Lombard, Carole;Higgin, Howard;2;No;NicholasCage.png 1941;95;Mr. & Mrs. Smith;Comedy;Montgomery, Robert;Lombard, Carole;Hitchcock, Alfred;3;No;alfredHitchcock.png 1986;132;Alrededor de Medianoche;Drama;Francois Cluzet;Lonette McKee;Rayfield, David;47;No;NicholasCage.png 1982;101;Losin' It;Comedy;Cruise, Tom;Long, Shelley;Hanson, Curtis;4;No;NicholasCage.png 1987;114;Into the Homeland;Action;Boothe, Powers;Longstreth, Emily;Glatter, Lesli Linka;34;No;NicholasCage.png 1991;60;Boxing Babes;Action;Nichol, Robin;Lords, Traci;Dell, Stewart;9;No;NicholasCage.png 1991;94;Shock 'em Dead;Horror;Donahue, Troy;Lords, Traci;Freed, Mark;31;No;NicholasCage.png 1960;101;Heller in Pink Tights;Drama;Quinn, Anthony;Loren, Sophia;Cukor, George;52;No;sophiaLoren.png 1961;100;Two Women;Drama;Belmondo, Jean-Paul;Loren, Sophia;De Sica, Vittorio;83;Yes;sophiaLoren.png 1954;107;Gold of Naples, The;Drama;De Sica, Vittorio;Loren, Sophia;De Sica, Vittorio;40;No;sophiaLoren.png 1963;118;Yesterday, Today & Tomorrow;Comedy;Mastroianni, Marcello;Loren, Sophia;De Sica, Vittorio;73;Yes;sophiaLoren.png 1957;109;Legend of the Lost;Action;Wayne, John;Loren, Sophia;Hathaway, Henry;84;No;sophiaLoren.png 1978;111;Brass Target;Action;Cassavetes, John;Loren, Sophia;Hough, 
John;53;No;sophiaLoren.png 1964;188;Fall of the Roman Empire, The;Drama;Boyd, Stphen;Loren, Sophia;Mann, Anthony;62;No;sophiaLoren.png 1961;172;El Cid;Drama;Heston, Charlton;Loren, Sophia;Mann, Anthony;10;No;sophiaLoren.png 1958;114;Desire under the Elms;Drama;Perkins, Anthony;Loren, Sophia;Mann, Delbert;13;No;sophiaLoren.png 1953;92;Two Nights with Cleo;Drama;Sordi, Alberto;Loren, Sophia;Mattoli, Mario;54;No;sophiaLoren.png 1959;;Black Orchid, The;Drama;Quinn, Anthony;Loren, Sophia;Ritt, Martin;54;No;sophiaLoren.png 1977;91;Angela;Drama;Railsback, Steve;Loren, Sophia;Sagal, Boris;80;No;sophiaLoren.png 1977;105;A Special Day;Drama;Mastroianni, Marcello;Loren, Sophia;Scola, Ettore;80;Yes;sophiaLoren.png 1979;112;Blood Feud;Action;Mastroianni, Marcello;Loren, Sophia;Wertmuller, Lina;52;No;sophiaLoren.png 1991;145;Sophia Loren, Her Own Story;Drama;Gavin, John;Loren, Sophia;;49;No;sophiaLoren.png 1990;;Running Away;Drama;Loggia, Robert;Loren, Sophia;;2;No;sophiaLoren.png 1991;130;Man of La Mancha;Music;O'Toole, Peter;Loren, Sophia;;55;No;sophiaLoren.png 1992;116;Operation Crossbow;Action;Peppard, George;Loren, Sophia;;1;No;sophiaLoren.png 1986;141;Courage;Drama;Williams, Billy Dee;Loren, Sophia;;56;No;sophiaLoren.png 1986;94;RAD;Action;Allen, Bill;Loughlin, Lori;Needham, Hal;75;No;NicholasCage.png 1992;98;Secret Admirer;Comedy;Howell, C. 
Thomas;Loughlin, Lori;;55;No;NicholasCage.png 1979;85;Cocaine Cowboys;Action;Palance, Jack;Love, Suzanna;Lommel, Ulli;17;No;NicholasCage.png 1991;118;Test Pilot;Drama;Gable, Clark;Loy, Myrna;;13;No;NicholasCage.png 1943;64;Ape Man, The;Horror;Ford, Wallace;Lugosi, Bela;Beaudine, William;83;No;NicholasCage.png 1986;125;Mission, The;Drama;De Niro, Robert;Lunghi, Cherie;Joffe, Roland;20;No;NicholasCage.png 1991;102;Curly Sue;Comedy;Belushi, Jim;Lynch, Kelly;Hughes, John;2;No;NicholasCage.png 1962;150;Lolita;Drama;Mason, James;Lyon, Sue;Kubrick, Stanley;80;No;NicholasCage.png 1989;101;Sex, Lies, and Videotape;Drama;Spader, James;MacDowell, Andie;Soderbergh, Steven;70;Yes;NicholasCage.png 1990;107;Green Card;Comedy;Depardieu, Gérard;MacDowell, Andie;Weir, Peter;25;No;NicholasCage.png 1988;95;Gator Bait II;Action;Muzzcat, Paul;MacKenzie, Jan;Sebastian, Beverly;73;No;NicholasCage.png 1979;129;Being There;Comedy;Sellers, Peter;MacLaine, Shirley;Ashby, Hal;31;Yes;NicholasCage.png 1983;132;Terms of Endearment;Drama;Nicholson, Jack;MacLaine, Shirley;Brooks, James L.;32;Yes;JackNicholson.png 1967;99;Woman Times Seven;Comedy;Sellers, Peter;MacLaine, Shirley;De Sica, Vittorio;36;No;NicholasCage.png 1968;;Bliss of Mrs. 
Blossom, The;Comedy;Booth, James;MacLaine, Shirley;McGrath, Joseph;86;No;NicholasCage.png 1990;101;Postcards from the Edge;Comedy;Quaid, Dennis;MacLaine, Shirley;Nichols, Mike;63;No;NicholasCage.png 1970;105;Two Mules for Sister Sara;Western;Eastwood, Clint;MacLaine, Shirley;Siegel, Don;36;No;clintEastwood.png 1992;84;Dragonfight;Drama;Z'Dar, Robert;MacLaren, Fawna;;71;No;NicholasCage.png 1939;85;Back Door to Heaven;Drama;Ford, Wallace;MacMahon, Aline;Howard, William K.;83;No;NicholasCage.png 1988;100;Ciao Italia, Madonna Live from Italy;Music;;Madonna;De Winter, Harry;74;No;NicholasCage.png 1991;118;Madonna, Truth or Dare;Music;;Madonna;Keshishian, Alek;54;No;NicholasCage.png 1992;60;A Certain Sacrifice;Music;Pattnosh, Jeremy;Madonna;Lewicki, Steven Jon;24;No;NicholasCage.png 1991;40;National Enquirer, The Untold Story;Music;White, Vanna;Madonna;;65;No;NicholasCage.png 1990;60;Immaculate Collection, The;Music;;Madonna;;32;No;NicholasCage.png 1987;50;Madonna Live, The Virgin Tour;Music;;Madonna;;75;No;NicholasCage.png 1990;5;Madonna, Justify My Love;Music;;Madonna;;77;No;NicholasCage.png 1991;16;Madonna, Like a Virgin;Music;;Madonna;;63;No;NicholasCage.png 1988;83;Hot to Trot;Comedy;Goldthwait, Bob;Madsen, Virginia;Dinner, Michael;78;No;NicholasCage.png 1986;103;Fire with Fire;Drama;Sheffer, Craig;Madsen, Virginia;Gibbins, Duncan;9;No;NicholasCage.png 1990;120;Hot Spot;Drama;Johnson, Don;Madsen, Virginia;Hopper, Dennis;70;No;NicholasCage.png 1974;124;Amarcord;Drama;Noel, Magali;Maggio, Pupella;Fellini, Federico;50;Yes;NicholasCage.png 1988;85;Casablanca Express;Action;Connery, Jason;Maneri, Luisa;Martino, Sergio;33;No;NicholasCage.png 1980;94;Out of the Blue;Drama;Hopper, Dennis;Manz, Linda;Hopper, Dennis;4;No;NicholasCage.png 1949;110;Sands of Iwo Jima;War;Wayne, John;Mara, Adele;Dwan, Allan;72;No;johnWayne.png 1981;104;Hand, The;Horror;Caine, Michael;Marcovicci, Andrea;Stone, Oliver;44;No;NicholasCage.png 1989;81;Deep Cover;Mystery;Conti, Tom;Markham, 
Kika;Loncraine, Richard;15;No;NicholasCage.png 1955;92;Il Bidone;Drama;Crawford, Broderick;Masina, Guilietta;Fellini, Federico;70;No;NicholasCage.png 1986;130;El Guerrero Solitario;Drama;Eastwood, Clint;Mason, Marsha;Eastwood, Clint;77;No;clintEastwood.png 1986;130;Heartbreak Ridge;War;Eastwood, Clint;Mason, Marsha;Eastwood, Clint;61;No;clintEastwood.png 1977;110;Goodbye Girl, The;Comedy;Dreyfuss, Richard;Mason, Marsha;Ross, Herbert;6;Yes;NicholasCage.png 1991;113;Audrey Rose;Drama;Hopkins, Anthony;Mason, Marsha;;62;No;AnthonyHopkins.png 1981;86;Polyester;Comedy;Divine;Massey, Edith;;68;No;NicholasCage.png 1991;144;Robin Hood: Prince of Thieves;Action;Costner, Kevin;Mastrantonio, Mary Elizabeth;Costner, Kevin;8;No;NicholasCage.png 1992;101;White Sands;Drama;Dafoe, Willem;Mastrantonio, Mary Elizabeth;Donaldson, Roger;38;No;NicholasCage.png 1986;119;Color of Money, The;Drama;Newman, Paul;Mastrantonio, Mary Elizabeth;Scorsese, Martin;6;Yes;paulNewman.png 1986;119;Children of a Lesser God;Drama;Hurt, William;Matlin, Marlee;Haines, Randa;20;Yes;NicholasCage.png 1986;;Matador;Comedy;Banderas, Antonio;Maura, Carmen;Almodóvar, Pedro;34;No;NicholasCage.png 1989;88;Women on the Verge of a Nervous Breakdown;Comedy;Banderas, Antonio;Maura, Carmen;Almodóvar, Pedro;65;No;NicholasCage.png 1980;86;Pepi Luci Bom;Comedy;Rotaeta, Félix;Maura, Carmen;Almodóvar, Pedro;66;No;NicholasCage.png 1989;100;Forgotten, The;Mystery;Carradine, Keith;Maynard, Mimi;Keach, James;69;No;NicholasCage.png 1992;89;Flame & the Arrow, The;Action;Lancaster, Burt;Mayo, Virginia;;0;No;burtLancaster.png 1990;92;After the Shock;Drama;Kotto, Yaphet;McClanahan, Rue;Sherman, Gary;28;No;NicholasCage.png 1990;110;Modern Love;Comedy;Benson, Robby;McClanahan, Rue;;18;No;NicholasCage.png 1992;95;Riff Raff;Comedy;Carlyle, Robert;McCourt, Emer;Loach, Ken;71;No;NicholasCage.png 1967;81;Glory Stompers, The;Action;Hopper, Dennis;McCrea, Jody;Lanza, Anthony M.;27;No;NicholasCage.png 1990;181;Dances with 
Wolves;Western;Costner, Kevin;McDonnell, Mary;Costner, Kevin;8;Yes;NicholasCage.png 1987;130;Matewan;Drama;Jones, James Earl;McDonnell, Mary;Sayles, John;81;No;NicholasCage.png 1988;120;Mississippi Burning;Drama;Hackman, Gene;McDormand, Frances;Parker, Alan;41;Yes;NicholasCage.png 1975;130;Eiger Sanction, The;Action;Eastwood, Clint;McGee, Vonetta;Eastwood, Clint;69;No;clintEastwood.png 1988;109;Unsettled Land;Drama;Shea, John;McGillis, Kelly;Barbash, Uri;75;No;NicholasCage.png 1991;98;Cat Chaser;Drama;Weller, Peter;McGillis, Kelly;Ferrera, Abel;6;No;NicholasCage.png 1988;110;Accused, The;Drama;Coulson, Bernie;McGillis, Kelly;Kaplan, Jonathan;71;Yes;NicholasCage.png 1989;109;Winter People;Drama;Russell, Kurt;McGillis, Kelly;Kotcheff, Ted;30;No;NicholasCage.png 1983;101;Reuben, Reuben;Comedy;Conti, Tom;McGillis, Kelly;Miller, Robert Ellis;2;No;NicholasCage.png 1987;102;Made in Heaven;Fantasy;Hutton, Timothy;McGillis, Kelly;Rudolph, Alan;57;No;NicholasCage.png 1986;109;Top Gun;Action;Cruise, Tom;McGillis, Kelly;Scott, Tony;8;No;NicholasCage.png 1985;112;Witness;Drama;Ford, Harrison;McGillis, Kelly;Weir, Peter;59;No;NicholasCage.png 1988;111;House on Carroll Street, The;Mystery;Daniels, Jeff;McGillis, Kelly;;6;No;NicholasCage.png 1984;109;Racing with the Moon;Drama;Penn, Sean;McGovern, Elizabeth;Benjamin, Richard;50;No;NicholasCage.png 1983;98;Lovesick;Comedy;Moore, Dudley;McGovern, Elizabeth;Brickman, Marshall;51;No;NicholasCage.png 1988;106;She's Having a Baby;Comedy;Hughes, Kevin Bacon;McGovern, Elizabeth;;18;No;NicholasCage.png 1965;199;Greatest Story Ever Told, The;Drama;Sydow, Max von;McGuire, Dorothy;Stevens, George;26;No;NicholasCage.png 1989;105;Hawks;Drama;Dalton, Timothy;McTeer, Janet;Miller, Robert Ellis;11;No;NicholasCage.png 1981;91;So Fine;Comedy;O'Neal, Ryan;Melato, Mariangela;Bergman, Andrew;17;No;NicholasCage.png 1957;89;Paths of Glory;Drama;Douglas, Kirk;Menjou, Adolphe;Kubrick, Stanley;47;No;NicholasCage.png 1964;120;Tom Jones;Drama;Ustinov, 
Peter;Mercouri, Melina;Dassin, Jules;39;Yes;NicholasCage.png 1975;103;Sunshine Boys, The;Comedy;Burns, George;Meredith, Lee;Ross, Herbert;35;Yes;NicholasCage.png 1988;98;Caddyshack 2;Comedy;Mason, Jackie;Merrill, Dina;Arkush, Allan;34;No;NicholasCage.png 1990;117;Internal Affairs;Drama;Gere, Richard;Metcalf, Laurie;Figgis, Mike;3;No;NicholasCage.png 1991;206;JFK;Drama;Costner, Kevin;Metcalf, Laurie;Stone, Oliver;78;No;NicholasCage.png 1991;97;New Jack City;Action;Snipes, Wesley;Michael Michele;Van Peebles, Mario;80;No;NicholasCage.png 1991;87;Scenes from a Mall;Comedy;Allen, Woody;Midler, Bette;;8;No;woody.png 1987;118;Hope & Glory;War;Hayman, David;Miles, Sarah;Boorman, John;3;No;NicholasCage.png 1970;194;Ryan's Daughter;Drama;Mitchum, Robert;Miles, Sarah;Lean, David;81;Yes;NicholasCage.png 1973;127;Man Who Loved Cat Dancing, The;Western;Reynolds, Burt;Miles, Sarah;Sarafian, Richard C.;40;No;NicholasCage.png 1962;123;Man Who Shot Liberty Valance, The;Western;Stewart, James;Miles, Vera;Ford, John;85;No;johnFord.png 1989;102;Dead-Bang;Action;Johnson, Don;Miller, Penelope Ann;Frankenheimer, John;9;No;NicholasCage.png 1988;90;Big Top Pee-wee;Comedy;Reubens, Paul;Miller, Penelope Ann;Kleiser, Randal;17;No;NicholasCage.png 1960;103;Time Machine, The;Science Fiction;Taylor, Rod;Mimieux, Yvette;Pal, George;88;No;NicholasCage.png 1972;128;Cabaret;Drama;Grey, Joel;Minnelli, Liza;Fosse, Bob;59;Yes;NicholasCage.png 1981;97;Arthur;Comedy;Moore, Dudley;Minnelli, Liza;Gordon, Steve;79;Yes;NicholasCage.png 1976;97;A Matter of Time;Drama;Boyer, Charles;Minnelli, Liza;Minnelli, Vincente;70;No;NicholasCage.png 1977;137;New York, New York;Drama;De Niro, Robert;Minnelli, Liza;Scorsese, Martin;8;No;NicholasCage.png 1989;89;Nightmare on Elm Street, Pt. 5, The Dream Child;Horror;Englund, Robert;Minter, Kelly Jo;Hopkins, Stephen;41;No;NicholasCage.png 1980;100;Fiendish Plot of Dr. 
Fu Manchu, The;Comedy;Sellers, Peter;Mirren, Helen;Haggard, Piers;29;No;NicholasCage.png 1991;240;Four American Composers;Music;Cage, John;Monk, Meredith;Greenaway, Peter;3;No;NicholasCage.png 1950;112;Asphalt Jungle, The;Action;Hayden, Sterling;Monroe, Marilyn;Huston, John;77;No;NicholasCage.png 1992;61;Ladies of the Chorus;Music;Garr, Eddie;Monroe, Marilyn;Karlson, Phil;60;No;NicholasCage.png 1953;95;How to Marry a Millionaire;Comedy;Powell, William;Monroe, Marilyn;Negulesco, Jean;65;No;NicholasCage.png 1983;;Hollywood Out-Takes & Rare Footage;Comedy;Bogart, Humphrey;Monroe, Marilyn;;27;No;NicholasCage.png 1991;94;Nothing But Trouble;Comedy;Candy, John;Moore, Demi;Aykroyd, Dan;25;No;NicholasCage.png 1987;109;Wisdom;Action;Estevez, Emilio;Moore, Demi;Estevez, Emilio;25;No;NicholasCage.png 1986;94;One Crazy Summer;Comedy;Cusack, John;Moore, Demi;Holland, Savage Steve;61;No;NicholasCage.png 1989;110;We're No Angels;Comedy;De Niro, Robert;Moore, Demi;Jordan, Neil;51;No;NicholasCage.png 1984;102;No Small Affair;Comedy;Cryer, Jon;Moore, Demi;Schatzberg, Jerry;10;No;NicholasCage.png 1990;127;Ghost;Science Fiction;Swayze, Patrick;Moore, Demi;Zucker, Jerry;6;Yes;NicholasCage.png 1986;113;About Last Night;Drama;Lowe, Rob;Moore, Demi;Zwick, Edward;66;No;NicholasCage.png 1982;107;Six Weeks;Drama;Moore, Dudley;Moore, Mary Tyler;Bill, Tony;73;No;NicholasCage.png 1948;89;Return of October;Comedy;Ford, Glenn;Moore, Terry;Lewis, Joseph H.;35;No;glennFord.png 1952;99;Come Back, Little Sheba;Drama;Lancaster, Burt;Moore, Terry;Mann, Daniel;50;Yes;burtLancaster.png 1974;117;Going Places;Drama;Depardieu, Gérard;Moreau, Jeanne;Blier, Bertrand;66;No;NicholasCage.png 1970;99;Monte Walsh;Western;Marvin, Lee;Moreau, Jeanne;Fraker, William A.;29;No;NicholasCage.png 1955;100;Mr. 
Arkadin;Drama;Welles, Orson;Mori, Paola;Welles, Orson;80;No;NicholasCage.png 1988;;White of the Eye;Mystery;Keith, David;Moriarty, Cathy;Cammell, Donald;48;No;NicholasCage.png 1968;90;Producers, The;Comedy;Wilder, Gene;Mostel, Zero;Brooks, Mel;33;No;NicholasCage.png 1976;94;Front, The;Drama;Allen, Woody;Mostel, Zero;Ritt, Martin;70;No;woody.png 1987;86;House of the Rising Sun;Drama;Annese, Frank;Moyer, Tawny;Gold, Greg;45;No;NicholasCage.png 1988;91;In a Shallow Grave;Drama;Biehn, Michael;Mueller, Maureen;Bowser, Kenneth;72;No;NicholasCage.png 1974;111;Mc Q;Action;Wayne, John;Muldaur, Diana;Sturges, John;73;No;johnWayne.png 1941;85;Lady from Louisiana;Drama;Wayne, John;Munson, Ona;Vorhaus, Bernard;38;No;johnWayne.png 1990;102;Wait Until Spring Bandini;Drama;Mantegna, Joe;Muti, Ornella;Deruddere, Dominique;29;No;NicholasCage.png 1940;105;Long Voyage Home, The;Drama;Wayne, John;Natwick, Mildred;Ford, John;88;No;johnWayne.png 1955;100;Trouble with Harry, The;Mystery;Forsythe, John;Natwick, Mildred;Hitchcock, Alfred;28;No;alfredHitchcock.png 1987;60;Encounters;Drama;Von Bergan, Raven;Navarro, Monica;Marder, Bruce;44;No;NicholasCage.png 1963;112;Hud;Drama;Newman, Paul;Neal, Patricia;Ritt, Martin;2;Yes;paulNewman.png 1951;111;Operation Pacific;War;Wayne, John;Neal, Patricia;;5;No;johnWayne.png 1987;83;Surf Nazis Must Die;Horror;Brenner, Barry;Neely, Gail;George, Peter;50;No;NicholasCage.png 1956;124;Teahouse of the August Moon;Drama;Brando, Marlon;Negami, Jun;Mann, Daniel;11;No;brando.png 1992;88;Back in the U.S.S.R.;Action;Whaley, Frank;Negoda, Natalya;;61;No;NicholasCage.png 1970;91;Man Who Haunted Himself, The;Drama;Moore, Roger;Neil, Hildegard;Dearden, Basil;75;No;NicholasCage.png 1991;108;Prisoner of Honor.;Drama;Dreyfuss, Richard;Neilson, Catherine;Russell, Ken;58;No;NicholasCage.png 1988;83;Control;Drama;Lancaster, Burt;Nelligan, Kate;;27;No;burtLancaster.png 1923;57;Desert Rider;Western;Hoxie, Jack;Nelson, Evelyn;Bradbury, Robert N.;;No;NicholasCage.png 
1980;109;Wholly Moses!;Comedy;Moore, Dudley;Newman, Laraine;Weis, Gary;25;No;NicholasCage.png 1991;110;Star Trek VI: The Undiscovered Country;Science Fiction;Shatner, William;Nichols, Nichelle;Meyer, Nicholas;11;No;NicholasCage.png 1989;107;Star Trek V: The Final Frontier;Action;Shatner, William;Nichols, Nichelle;Shatner, William ;87;No;NicholasCage.png 1991;85;Circuitry Man;Action;Metzler, Jim;Nicholson, Dana W.;Lovy, Steven;78;No;NicholasCage.png 1986;87;Cobra;Action;Stallone, Sylvester;Nielsen, Brigitte;Cosmatos, George P.;57;No;NicholasCage.png 1987;103;Beverly Hills Cop II;Comedy;Murphy, Eddie;Nielsen, Brigitte;Scott, Tony;37;No;NicholasCage.png 1990;90;Red Sonja;Action;Schwarzenegger, Arnold;Nielsen, Brigitte;;40;No;NicholasCage.png 1950;93;To Joy;Drama;Olin, Stig;Nilsson, Maj-Britt;Bergman, Ingmar;65;No;Bergman.png 1992;112;Macbeth;Drama;Welles, Orson;Nolan, Jeanette;;45;No;NicholasCage.png 1958;128;Vertigo;Drama;Stewart, James;Novak, Kim;Hitchcock, Alfred;10;No;alfredHitchcock.png 1987;91;Young Love: Lemon Popsicle Seven;Comedy;Katzur, Yftach;Noy, Zachi;Bennett, Walter;47;No;NicholasCage.png 1946;93;Crack-Up;Mystery;Marshall, Herbert;O'Brien, Pat;Reis, Irving;25;No;NicholasCage.png 1941;57;Bury Me Not on the Lone Prairie;Western;Brown, Johnny Mack;O'Day, Nell;Taylor, Ray;85;No;NicholasCage.png 1940;57;Law & Order;Western;Brown, Johnny Mack;O'Day, Nell;Taylor, Ray;87;No;NicholasCage.png 1941;56;Man from Montana;Western;Brown, Johnny Mack;O'Day, Nell;Taylor, Ray;85;No;NicholasCage.png 1992;137;Long Gray Line, The;Drama;Power, Tyrone;O'Hara, Maureen;Ford, John;26;No;johnFord.png 1950;105;Rio Grande;Western;Wayne, John;O'Hara, Maureen;Ford, John;64;No;johnWayne.png 1957;107;Wings of Eagles, The;Drama;Wayne, John;O'Hara, Maureen;Ford, John;29;No;johnWayne.png 1939;94;Jamaica Inn;Drama;Laughton, Charles;O'Hara, Maureen;Hitchcock, Alfred;75;No;alfredHitchcock.png 1971;110;Big Jake;Action;Wayne, John;O'Hara, Maureen;Sherman, George;68;No;johnWayne.png 
1992;153;Quiet Man, The;Drama;Wayne, John;O'Hara, Maureen;;74;No;johnWayne.png 1983;72;After the Rehearsal;Drama;Josephson, Erland;Olin, Lena;Bergman, Ingmar;0;No;Bergman.png 1952;90;Big Jim McLain;Western;Wayne, John;Olson, Nancy;Ludwig, Edward;14;No;johnWayne.png 1969;101;Smith!;Western;Ford, Glenn;Olson, Nancy;O'Herlihy, Michael;62;No;glennFord.png 1953;79;Wild One, The;Drama;Brando, Marlon;O'Malley, Pat;Benedek, Laslo;26;No;brando.png 1929;129;Manxman, The;Drama;Brisson, Carl;Ondra, Anny;Hitchcock, Alfred;65;No;alfredHitchcock.png 1978;126;International Velvet;Drama;Hopkins, Anthony;O'Neal, Tatum;Forbes, Bryan;40;No;AnthonyHopkins.png 1981;104;Scanners;Horror;Lack, Stephen;O'Neill, Jennifer;Cronenberg, David;32;No;NicholasCage.png 1986;98;Trick or Treat;Horror;Price, Marc;Orgolini, Lisa;Smith, Charles Martin;47;No;NicholasCage.png 1982;92;48 Hrs.;Action;Nolte, Nick;O'Toole, Annette;Hill, Walter;67;No;NicholasCage.png 1985;108;Trip to Bountiful, The;Drama;Heard, John;Page, Geraldine;Masterson, Peter;62;Yes;NicholasCage.png 1955;116;Mister Roberts;Comedy;Fonda, Henry;Palmer, Betsy;Ford, John;8;Yes;johnFord.png 1969;127;Z;Drama;Montand, Yves;Papas, Irene;Costa-Gavras;72;Yes;NicholasCage.png 1987;139;Maurice;Drama;Wilby, James;Parfitt, Judy;Ivory, James;45;No;NicholasCage.png 1969;114;Hamlet;Drama;Williamson, Nicol;Parfitt, Judy;Richardson, Tony;39;No;NicholasCage.png 1991;117;La Femme Nikita;Drama;Karyo, Tcheky;Parillaud, Anne;Besson, Luc;6;No;NicholasCage.png 1993;95;Honeymoon in Vegas;Comedy;Caan, James;Parker, Sarah Jessica;Bergman, Andrew;53;No;NicholasCage.png 1988;90;Going for the Gold;Action;Edwards, Anthony;Parker, Sarah Jessica;Taylor, Dan;10;No;clintEastwood.png 1976;128;Shout at the Devil;Action;Marvin, Lee;Parkins, Barbara;Hunt, Peter R.;0;No;NicholasCage.png 1986;94;A Smoky Mountain Christmas;Music;Majors, Lee;Parton, Dolly;Winkler, Henry;23;No;NicholasCage.png 1984;95;Getting Physical;Drama;Naughton, David;Paul, Alexandra;Stern, Steven 
Hilliard;75;No;NicholasCage.png 1990;95;Torn Apart;Drama;Pasdar, Adrian;Peck, Cecilia;Fisher, Jack;8;No;NicholasCage.png 1986;112;From the Hip;Comedy;Nelson, Judd;Perkins, Elizabeth;Clark, Bob;36;No;NicholasCage.png 1984;102;Ratings Game, The;Comedy;DeVito, Danny;Perlman, Rhea;DeVito, Danny;21;No;NicholasCage.png 1992;100;Class Act;Drama;Reid, Christopher;Perlman, Rhea;;88;No;NicholasCage.png 1986;89;Water;Comedy;Caine, Michael;Perrine, Valerie;Clement, Dick;47;No;NicholasCage.png 1978;88;Silent Movie;Comedy;Brooks, Mel;Peters, Bernadette;Brooks, Mel;27;No;NicholasCage.png 1989;122;Pink Cadillac;Comedy;Eastwood, Clint;Peters, Bernadette;Eastwood, Clint;12;No;clintEastwood.png 1979;94;Jerk, The;Comedy;Martin, Steve;Peters, Bernadette;Reiner, Carl;22;No;NicholasCage.png 1980;180;Wild Times;;Elliott, Sam;Peyser, Penny;Compton, Richard;75;No;NicholasCage.png 1986;107;Sweet Liberty;Comedy;Alda, Alan;Pfeiffer, Michelle;Alda, Alan;12;No;MichellePfeiffer.png 1982;115;Grease II;Music;Caulfield, Maxwell;Pfeiffer, Michelle;Birch, Patricia;64;No;MichellePfeiffer.png 1989;104;Married to the Mob;Comedy;Modine, Matthew;Pfeiffer, Michelle;Demme, Jonathan;8;No;MichellePfeiffer.png 1985;121;Ladyhawke;Adventure;Broderick, Matthew;Pfeiffer, Michelle;Donner, Richard;68;No;MichellePfeiffer.png 1989;114;Fabulous Baker Boys, The;Drama;Bridges, Jeff;Pfeiffer, Michelle;Kloves, Steve;66;No;MichellePfeiffer.png 1985;115;Into the Night;Comedy;Goldblum, Jeff;Pfeiffer, Michelle;Landis, John;62;No;MichellePfeiffer.png 1991;124;Russia House, The;Drama;Connery, Sean;Pfeiffer, Michelle;Schepisi, Fred;3;No;MichellePfeiffer.png 1988;116;Tequila Sunrise;Mystery;Gibson, Mel;Pfeiffer, Michelle;Towne, Robert;50;No;MichellePfeiffer.png 1989;74;B. A. D. 
Cats;Action;Morrow, Vic;Pfeiffer, Michelle;;87;No;MichellePfeiffer.png 1971;108;Last Movie, The;Drama;Hopper, Dennis;Phillips, Michelle;Hopper, Dennis;22;No;NicholasCage.png 1973;106;Dillinger;Drama;Oates, Warren;Phillips, Michelle;Milius, John;83;No;NicholasCage.png 1988;360;Little Dorrit;Drama;Jacobi, Derek;Pickering, Sarah;Edzard, Christine;12;No;NicholasCage.png 1927;78;My Best Girl;Drama;Rogers, Charles;Pickford, Mary;Taylor, Sam;31;No;NicholasCage.png 1989;93;Seizure;Horror;Frid, Jonathan;Pickles, Christina;Stone, Oliver;59;No;NicholasCage.png 1990;89;A Chorus of Disapproval;Comedy;Irons, Jeremy;Pigg, Alexandra;Winner, Michael;0;No;NicholasCage.png 1962;119;Rome Adventure;Drama;Donahue, Tony;Pleshette, Suzanne;Daves, Delmer;39;No;NicholasCage.png 1992;121;Drowning by Numbers;Mystery;Hill, Bernard;Plowright, Joan;Greenaway, Peter;28;No;NicholasCage.png 1991;88;Born to Ride;Action;Stamos, John;Polo, Teri;Baker, Graham;59;No;NicholasCage.png 1988;94;Her Alibi;Comedy;Selleck, Tom;Porizkova, Paulina;Beresford, Bruce;80;No;NicholasCage.png 1988;96;Glitz;Mystery;Smits, Jimmy;Post, Markie;;9;No;NicholasCage.png 1990;95;Dangerous Pursuit;Mystery;Harrison, Gregory;Powers, Alexandra;Stern, Sandor;88;No;NicholasCage.png 1962;123;Experiment in Terror;Mystery;Ford, Glenn;Powers, Stefanie;Edwards, Blake;77;No;glennFord.png 1972;105;Hideaways, The;Comedy;Doran, Johnny;Prager, Sally;Cook, Fielder;42;No;NicholasCage.png 1965;108;What's New Pussycat;Comedy;O'Toole, Peter;Prentiss, Paula;Donner, Clive;83;No;NicholasCage.png 1965;108;What's New Pussycat?;Comedy;Sellers, Peter;Prentiss, Paula;Donner, Clive;46;No;NicholasCage.png 1983;98;Packin' It In;Comedy;Benjamin, Richard;Prentiss, Paula;Taylor, Jud;8;No;NicholasCage.png 1988;90;Naked Gun: From the Files of Police Squad!, THe;Comedy;Nielsen, Leslie;Presley, Priscilla;Zucker, David;9;No;NicholasCage.png 1990;106;In Too Deep;Drama;Race, Hugo;Press, Santha;Tatoulis, Colin South, John;50;No;NicholasCage.png 
1988;107;Twins;Comedy;Schwarzenegger, Arnold;Preston, Kelly;Reitman, Ivan;23;No;NicholasCage.png 1988;94;Experts, The;Comedy;Travolta, John;Preston, Kelly;Thomas, Dave;67;No;NicholasCage.png 1989;94;Naked Lie;Drama;Lucking, William;Principal, Victoria;Colla, Richard A.;7;No;NicholasCage.png 1987;87;Mistress;Drama;Rachins, Allan;Principal, Victoria;Tuchner, Michael;36;No;NicholasCage.png 1992;;Pleasure Palace;Action;Sharif, Omar;Principal, Victoria;;45;No;NicholasCage.png 1970;100;Adam at 6 A.M.;Drama;Douglas, Michael;Purcell, Lee;Scheerer, Robert;3;No;NicholasCage.png 1990;93;Web of Deceit;Drama;Read, James;Purl, Linda;Stern, Sandor;6;No;NicholasCage.png 1991;119;New York Stories;Comedy;Allen, Woody;Questel, Mae;Coppola, Francis Ford;6;No;NicholasCage.png 1987;97;Dreams Lost, Dreams Found;Drama;Robb, David;Quinlan, Kathleen;Patterson, Willi;66;No;NicholasCage.png 1987;103;Au Revoir les Enfants;Drama;Manesse, Gaspard;Racette, Francine;Malle, Louis;35;No;NicholasCage.png 1989;122;Quo Vadis;Drama;Brandauer, Klaus Maria;Raines, Cristina;Rossi, Franco;6;No;NicholasCage.png 1949;100;Fighting Kentuckian, The;Action;Wayne, John;Ralston, Vera;Waggner, George;74;No;johnWayne.png 1974;104;Zardoz;Science Fiction;Connery, Sean;Rampling, Charlotte;Boorman, John;6;No;seanConnery.png 1989;84;Police Academy 6: City under Siege;Comedy;Smith, Bubba;Ramsey, Marion;Bonerz, Peter;29;No;NicholasCage.png 1988;90;Police Academy 5: Assignment Miami Beach;Comedy;Gaynes, George;Ramsey, Marion;Myerson, Alan;59;No;NicholasCage.png 1986;84;Police Academy 3: Back in Training;Comedy;Guttenberg, Steve;Ramsey, Marion;Paris, Jerry;6;No;NicholasCage.png 1991;60;America's Music, Blues;Music;Hopkins, Linda;Redd, Vi;Walton, Kip;54;No;NicholasCage.png 1977;100;Julia;Drama;Fonda, Vanessa;Redgrave, Jane;Zinnemann, Fred;75;Yes;NicholasCage.png 1971;111;Devils, The;Drama;Reed, Oliver;Redgrave, Vanessa;Russell, Ken;69;No;NicholasCage.png 1984;90;Ransom;Drama;Ford, Glenn;Reed, Donna;Segal, 
Alex;73;No;glennFord.png 1990;97;Cadillac Man;Comedy;Williams, Robin;Reed, Pamela;Donaldson, Roger;28;No;NicholasCage.png 1986;104;Best of Times, The;Comedy;Williams, Robin;Reed, Pamela;Spottiswoode, Roger;88;No;NicholasCage.png 1985;135;Death of a Salesman;Drama;Hoffman, Dustin;Reid, Kate;Schlöndorff, Volker;13;No;NicholasCage.png 1993;104;It Started with a Kiss;Drama;Ford, Glenn;Reynolds, Debbie;;80;No;glennFord.png 1989;88;Money, The;Drama;Luckinbill, Laurence;Richards, Elizabeth;;29;No;NicholasCage.png 1987;153;Empire of the Sun;Drama;Malkovich, John;Richardson, Miranda;Spielberg, Steven;6;No;NicholasCage.png 1991;102;Comfort of Strangers, The;Mystery;Walken, Christopher;Richardson, Natasha;Schrader, Paul;5;No;NicholasCage.png 1969;135;On Her Majesty's Secret Service;Action;Lazenby, George;Rigg, Diana;Hunt, Peter R.;66;No;NicholasCage.png 1986;96;Pretty in Pink;Drama;Stanton, Harry Dean;Ringwald, Molly;Deutch, Howard;75;No;NicholasCage.png 1987;90;PK. & the Kid.;Drama;LeMat, Paul;Ringwald, Molly;;49;No;NicholasCage.png 1943;60;Lone Star Trail, The;Western;Brown, Johnny Mack;Ritter, Tex;Taylor, Ray;27;No;NicholasCage.png 1986;98;Summer;Comedy;Gauthier, Vincent;Riviere, Marie;Rohmer, Eric;11;No;NicholasCage.png 1987;93;Planes, Trains & Automobiles;Comedy;Martin, Steve;Robbins, Laila;Hughes, John;73;No;NicholasCage.png 1990;119;Pretty Woman;Comedy;Gere, Richard;Roberts, Julia;Marshall, Garry;43;No;NicholasCage.png 1991;111;Flatliners;Drama;Sutherland, Kiefer;Roberts, Julia;Schumacher, Joel;19;No;NicholasCage.png 1991;142;Hook;Action;Williams, Robin;Roberts, Julia;Spielberg, Steven;4;No;NicholasCage.png 1940;56;Riders of Pasco Basin;Western;Brown, Johnny Mack;Robinson, Frances;Taylor, Ray;17;No;NicholasCage.png 1992;53;Gotta Dance, Gotta Sing;Music;Astaire, Fred;Rogers, Ginger;;20;No;NicholasCage.png 1990;106;Desperate Hours;Mystery;Rourke, Mickey;Rogers, Mimi;Cimino, Michael;58;No;NicholasCage.png 1986;111;Gung Ho;Comedy;Keaton, Michael;Rogers, Mimi;Howard, 
Ron;59;No;NicholasCage.png 1992;96;Shooting Elizabeth;Mystery;Goldblum, Jeff;Rogers, Mimi;Taylor, Baz;5;No;NicholasCage.png 1951;101;Strangers on a Train;Mystery;Granger, Farley;Roman, Ruth;Hitchcock, Alfred;17;No;alfredHitchcock.png 1979;198;Sacketts, The;Western;Elliott, Sam;Roman, Ruth;Totten, Robert;86;No;NicholasCage.png 1991;87;To Die Standing;Action;De Young, Cliff;Rose, Jamie;Morneau, Louis;53;No;NicholasCage.png 1980;92;Rodeo Girl;Drama;Hopkins, Bo;Ross, Katharine;Cooper, Jackie;80;No;NicholasCage.png 1969;110;Butch Cassidy & the Sundance Kid;Western;Newman, Paul;Ross, Katharine;Hill, George Roy;29;Yes;paulNewman.png 1968;121;Hellfighters;Action;Wayne, John;Ross, Katharine;McLaglen, Andrew V.;22;No;johnWayne.png 1980;92;Final Countdown, The;Action;Douglas, Kirk;Ross, Katharine;Taylor, Don;35;No;NicholasCage.png 1986;120;Blue Velvet;Mystery;MacLachlan, Kyle;Rossellini, Isabella;Lynch, David;6;No;lynch.png 1989;110;Cousins;Comedy;Danson, Ted;Rossellini, Isabella;;28;No;NicholasCage.png 1976;90;Black & White in Color;Comedy;Carmet, Jean;Rouvel, Catherine;Annaud, Jean-Jacques;24;Yes;NicholasCage.png 1988;81;Another Woman;Drama;Hackman, Gene;Rowlands, Gena;Allen, Woody;7;No;woody.png 1992;128;Night on Earth;Drama;Benigni, Roberto;Rowlands, Gena;Jarmusch, Jim;24;No;NicholasCage.png 1988;92;Permanent Record;Drama;Boyce, Alan;Rubin, Jennifer;Silver, Marisa;42;No;NicholasCage.png 1992;138;Fisher King, The;Drama;Williams, Robin;Ruehl, Mercedes;Gilliam, Terry;8;Yes;NicholasCage.png 1991;98;Another You;Comedy;Pryor, Richard;Ruehl, Mercedes;Phillips, Maurice;75;No;NicholasCage.png 1958;167;Young Lions, The;Drama;Brando, Marlon;Rush, Barbara;Dmytryk, Edward;10;No;NicholasCage.png 1988;89;Cheerleader Camp;Horror;Garrett, Leif;Russell, Betsy;Quinn, John;79;No;NicholasCage.png 1990;98;Trapper County War;Action;Hudson, Ernie;Russell, Betsy;;5;No;NicholasCage.png 1947;96;Angel & the Badman;Western;Wayne, John;Russell, Gail;Grant, James Edward;84;No;johnWayne.png 
1990;109;Impulse;Mystery;Fahey, Jeff;Russell, Theresa;Locke, Sondra;23;No;NicholasCage.png 1988;91;Track Twenty-Nine;Drama;Oldman, Gary;Russell, Theresa;Roeg, Nicolas;48;No;NicholasCage.png 1991;110;Freejack;Action;Estevez, Emilio;Russo, Rene;Richardson, Tony;26;No;NicholasCage.png 1939;109;John Wayne Matinee Double Feature, No. 1;Western;Wayne, John;Rutherford, Ann;;30;No;johnWayne.png 1988;81;Smallest Show on Earth, The;Comedy;Sellers, Peter;Rutherford, Margaret;Dearden, Basil;24;No;NicholasCage.png 1987;120;Innerspace;Science Fiction;Quaid, Dennis;Ryan, Meg;Dante, Joe;41;No;NicholasCage.png 1988;97;Presidio, The;Action;Connery, Sean;Ryan, Meg;Hyams, Peter;4;No;seanConnery.png 1990;102;Joe Versus the Volcano;Comedy;Hanks, Tom;Ryan, Meg;Patrick, John;17;No;NicholasCage.png 1991;135;Doors, The;Drama;Kilmer, Val;Ryan, Meg;Stone, Oliver;60;No;NicholasCage.png 1990;98;Welcome Home, Roxy Carmichael;Comedy;Daniels, Jeff;Ryder, Winona;Abrahams, Jim;41;No;NicholasCage.png 1972;99;Cancel My Reservation;Comedy;Hope, Bob;Saint, Eva Marie;Bogart, Paul;60;No;NicholasCage.png 1991;135;North by Northwest;Mystery;Grant, Cary;Saint, Eva Marie;Hitchcock, Alfred;20;No;alfredHitchcock.png 1966;127;Russians Are Coming, the Russians Are, The;Comedy;Reiner, Carl;Saint, Eva Marie;Jewison, Norman;79;Yes;NicholasCage.png 1992;213;Exodus;Drama;Newman, Paul;Saint, Eva Marie;Preminger, Otto;13;No;paulNewman.png 1982;128;Ballad of Narayama, The;Drama;Ogata, Ken;Sakamoto, Sumiko;Imamura, Shohei;88;No;NicholasCage.png 1985;96;Out of the Darkness;Mystery;Sheen, Martin;Salt, Jennifer;Taylor, Jud;86;No;NicholasCage.png 1971;90;Garden of the Finzi-Continis, The;Drama;Capolicchio, Lino;Sanda, Dominique;De Sica, Vittorio;42;Yes;NicholasCage.png 1974;105;Steppenwolf;Drama;Sydow, Max von;Sanda, Dominique;Haines, Fred;20;No;NicholasCage.png 1973;100;Mackintosh Man, The;Action;Newman, Paul;Sanda, Dominique;Huston, John;65;No;paulNewman.png 1968;105;Partner;Drama;Clementi, Pierre;Sandrelli, 
Stefania;Bertolucci, Bernardo;26;No;NicholasCage.png 1970;107;Conformist, The;Drama;Trintignant, Jean-Louis;Sandrelli, Stefania;Bertolucci, Bernardo;72;No;NicholasCage.png 1971;102;Dirty Harry;Drama;Eastwood, Clint;Santoni, Reni;Siegel, Don;72;No;clintEastwood.png 1986;103;Ferris Bueller's Day Off;Comedy;Broderick, Matthew;Sara, Mia;Hughes, John;12;No;NicholasCage.png 1986;89;Legend;Science Fiction;Cruise, Tom;Sara, Mia;Scott, Ridley;42;No;NicholasCage.png 1984;110;Buddy System, The;Drama;Dreyfuss, Richard;Sarandon, Susan;Jordan, Glenn;48;No;NicholasCage.png 1989;97;A Dry White Season;Drama;Sutherland, Donald;Sarandon, Susan;Palcy, Euzhan;71;No;NicholasCage.png 1975;105;Rocky Horror Picture Show, The;Music;Gray, Charles;Sarandon, Susan;Sharman, Jim;59;No;NicholasCage.png 1968;360;War & Peace;Drama;Tikhonov, Vyacheslav;Savelyeva, Lyudmila;Bondarchuk, Sergei;80;Yes;NicholasCage.png 1992;96;Defense of the Realm;Drama;Elliott, Denholm;Scacchi, Greta;;79;No;NicholasCage.png 1991;90;Basil The Rat;Comedy;Cleese, John;Scales, Prunella;;9;No;NicholasCage.png 1979;90;Fawlty Towers, Gourmet Night, Waldorf Salad & The Kipper & the Corpse;Comedy;Cleese, John;Scales, Prunella;;46;No;NicholasCage.png 1991;80;Going Under;Comedy;Pullman, Bill;Schaal, Wendy;Travis, Mark W.;30;No;NicholasCage.png 1990;83;U S. 
Sub Standard.;Comedy;Pullman, Bill;Schaal, Wendy;;27;No;NicholasCage.png 1990;;Hells Angels on Wheels;Action;Nicholson, Jack;Scharf, Sabrina;;1;No;NicholasCage.png 1975;118;Passenger, The;Drama;Nicholson, Jack;Schneider, Maria;Antonioni, Michelangelo;32;No;JackNicholson.png 1973;127;Last Tango in Paris;Drama;Brando, Marlon;Schneider, Maria;Bertolucci, Bernardo;28;No;brando.png 1987;155;Indigo Autumn & Lilac Dream;Drama;Singer, Marc;Schrage, Lisa;Gillard, Stuart;72;No;NicholasCage.png 1924;95;Kriemhild's Revenge, The Nibelungenlied;Drama;Loos, Theodor;Schön, Margarete;Lang, Fritz;74;No;NicholasCage.png 1966;102;Johnny Tiger;Drama;Taylor, Robert;Scott, Brenda;Wendkos, Paul;69;No;NicholasCage.png 1986;90;Head Office;Comedy;Reinhold, Judge;Seymour, Jane;Finkleman, Ken;88;No;NicholasCage.png 1990;;Live & Let Die;Action;Moore, Roger;Seymour, Jane;;62;No;NicholasCage.png 1972;100;Le Charme Discret de la Bourgeoisie;Comedy;Rey, Fernando;Seyrig, Delphine;Bunuel, Luis;4;Yes;NicholasCage.png 1986;83;Blue City;Action;Nelson, Judd;Sheedy, Ally;Manning, Michelle;38;No;NicholasCage.png 1983;123;Bad Boys;Drama;Penn, Sean;Sheedy, Ally;Rosenthal, Rick;7;No;NicholasCage.png 1986;82;Whoopee Boys, The;Comedy;O'Keefe, Michael;Shelley, Carole;Byrum, John;54;No;NicholasCage.png 1971;118;Last Picture Show, The;Drama;Bottoms, Timothy;Shepherd, Cybill;Bogdanovich, Peter;62;Yes;NicholasCage.png 1988;93;Diamond Trap, The;Drama;Hessman, Howard;Shields, Brooke;Taylor, Don;58;No;NicholasCage.png 1981;115;Endless Love;Drama;Hewitt, Martin;Shields, Brooke;Zeffirelli, Franco;20;No;NicholasCage.png 1976;90;Rocky;Drama;Stallone, Sylvester;Shire, Talia;Avildsen, John G.;78;Yes;NicholasCage.png 1988;103;Cocktail;Drama;Cruise, Tom;Shue, Elisabeth;Donaldson, Roger;13;No;NicholasCage.png 1936;77;Sabotage;Mystery;Homolka, Oskar;Sidney, Sylvia;Hitchcock, Alfred;74;No;alfredHitchcock.png 1977;105;Madame Rosa;Drama;Youb, Samy Ben;Signoret, Simone;Mizrahi, Moshe;11;Yes;NicholasCage.png 1985;56;Fozzie's Muppet 
Scrapbook;Comedy;Berle, Milton;Sills, Beverly;;86;No;NicholasCage.png 1954;110;Desiree;Drama;Brando, Marlon;Simmons, Jean;Koster, Henry;22;No;brando.png 1960;185;Spartacus;Drama;Douglas, Kirk;Simmons, Jean;Kubrick, Stanley;67;Yes;NicholasCage.png 1955;150;Guys & Dolls;Comedy;Brando, Marlon;Simmons, Jean;Mankiewicz, Joseph L.;70;Yes;brando.png 1992;95;Until They Sail;Drama;Newman, Paul;Simmons, Jean;Wise, Robert;77;No;paulNewman.png 1988;116;Coming to America;Comedy;Murphy, Eddie;Sinclair, Madge;Landis, John;11;No;NicholasCage.png 1963;93;Lilies of the Field;Drama;Poitier, Sidney;Skala, Lilia;Poitier, Sidney;36;Yes;NicholasCage.png 1987;99;River's Edge;Drama;Glover, Crispin;Skye, Ione;Hunter, Tim;3;No;NicholasCage.png 1986;93;Ruthless People;Comedy;DeVito, Danny;Slater, Helen;Abrahams, Jim;84;No;NicholasCage.png 1987;110;Secret of My Success, The;Comedy;Fox, Michael J.;Slater, Helen;Ross, Herbert;5;No;NicholasCage.png 1965;128;Shop on Main Street, The;Drama;Kroner, Josef;Slivoka, Hana;Kadar, Jan;37;Yes;NicholasCage.png 1988;101;Funny Farm;Comedy;Chase, Chevy;Smith, Madolyn;Hill, George Roy;30;No;NicholasCage.png 1988;120;Lonely Passion of Judith Hearne, The;Drama;Hoskins, Bob;Smith, Maggie;Clayton, Jack;24;No;NicholasCage.png 1978;103;California Suite;Comedy;Caine, Michael;Smith, Maggie;Ross, Herbert;11;Yes;NicholasCage.png 1986;97;Maximum Overdrive;Horror;Estevez, Emilio;Smith, Yeardley;King, Stephen;40;No;NicholasCage.png 1985;116;Pale Rider;Western;Eastwood, Clint;Snodgress, Carrie;Eastwood, Clint;45;No;clintEastwood.png 1990;88;Kissing Place, The;Drama;Birney, Meredith Baxter;Snow, Victoria;Wharmby, Tony;41;No;NicholasCage.png 1986;90;French Lesson;Comedy;Sterling, Alexandre;Snowden, Jane;Gilbert, Brian;29;No;NicholasCage.png 1985;88;Roller Blade;Action;Hutchinson, Jeff;Solari, Suzanne;Jackson, Donald G;31;No;NicholasCage.png 1964;101;A Shot in the Dark;Comedy;Sellers, Peter;Sommer, Elke;Edwards, Blake;51;No;NicholasCage.png 1979;88;Treasure Seekers, 
The;Action;Whitman, Stuart;Sommer, Elke;;2;No;NicholasCage.png 1982;122;Missing;Drama;Lemmon, Jack;Spacek, Sissy;Costa-Gavras;30;No;NicholasCage.png 1989;99;Picasso Trigger;Action;Bond, Steve;Speir, Dona;Sidaris, Andy;20;No;NicholasCage.png 1987;97;Hard Ticket to Hawaii;Action;Moss, Ronn;Speir, Dona;Sidaris, Andy;36;No;NicholasCage.png 1990;;Diamonds are Forever;Action;Connery, Sean;St. John, Jill;Hamilton, Guy;8;No;seanConnery.png 1933;72;Baby Face;Drama;Brent, George;Stanwyck, Barbara;Green, Alfred E.;66;No;NicholasCage.png 1992;95;Violent Men, The;Action;Ford, Glenn;Stanwyck, Barbara;Mate, Rudolph;25;No;glennFord.png 1985;117;Cocoon;Science Fiction;Ameche, Don;Stapleton, Maureen;Howard, Ron;45;Yes;NicholasCage.png 1986;96;Clockwise;Comedy;Cleese, John;Steadman, Alison;Morahan, Christopher;10;No;NicholasCage.png 1993;103;Romantic Comedy;Comedy;Moore, Dudley;Steenburgen, Mary;;8;No;NicholasCage.png 1981;111;Outland;Science Fiction;Connery, Sean;Sternhagen, Frances;Hyams, Peter;7;No;seanConnery.png 1967;114;Hang 'em High;Western;Eastwood, Clint;Stevens, Inger;Post, Ted;67;No;clintEastwood.png 1992;123;Basic Instinct;Mystery;Douglas, Michael;Stone, Sharon;Verhoeven, Paul;41;No;NicholasCage.png 1990;113;Total Recall;Action;Schwarzenegger, Arnold;Stone, Sharon;Verhoeven, Paul;8;No;NicholasCage.png 1987;115;Stakeout;Comedy;Dreyfuss, Richard;Stowe, Madeleine;Badham, John;13;No;NicholasCage.png 1992;104;Unnamable II, The Statement of Randolph Carter, The;Drama;Rhys-Davies, John;Strain, Julie;Ouellette, Jean-Paul;36;No;NicholasCage.png 1967;85;Trip, The;Drama;Fonda, Peter;Strasberg, Susan;Corman, Roger;64;No;NicholasCage.png 1987;135;Ironweed;Drama;Nicholson, Jack;Streep, Meryl;Babenco, Hector;32;No;merylStreep.png 1979;;Kramer vs. 
Kramer;Drama;Hoffman, Dustin;Streep, Meryl;Benton, Robert;8;Yes;merylStreep.png 1988;;Still of the Night;Mystery;Scheider, Roy;Streep, Meryl;Benton, Robert;42;No;merylStreep.png 1991;112;Defending Your Life;Comedy;Brooks, Albert;Streep, Meryl;Brooks, Albert;75;No;merylStreep.png 1978;183;Deer Hunter, The;Drama;De Niro, Robert;Streep, Meryl;Cimino, Michael;82;Yes;merylStreep.png 1984;106;Falling in Love;Drama;De Niro, Robert;Streep, Meryl;Grosbard, Ulu;31;No;merylStreep.png 1986;108;Heartburn;Comedy;Nicholson, Jack;Streep, Meryl;Nichols, Mike;57;No;JackNicholson.png 1983;131;Silkwood;Drama;Russell, Kurt;Streep, Meryl;Nichols, Mike;52;No;merylStreep.png 1982;151;Sophie's Choice;Drama;Kline, Kevin;Streep, Meryl;Pakula, Alan J.;64;Yes;merylStreep.png 1985;161;Out of Africa;Drama;Redford, Robert;Streep, Meryl;Pollack, Sydney;88;Yes;merylStreep.png 1981;127;French Lieutenant's Woman, The;Drama;Irons, Jeremy;Streep, Meryl;Reisz, Karel;37;No;merylStreep.png 1985;124;Plenty;Drama;Dance, Charles;Streep, Meryl;Schepisi, Fred;9;No;merylStreep.png 1988;122;A Cry in the Dark;Drama;Neill, Sam;Streep, Meryl;Schepisi, Fred;67;No;merylStreep.png 1989;99;She-Devil;Comedy;Begley, Ed, Jr.;Streep, Meryl;Seidelman, Susan;43;No;merylStreep.png 1992;103;Death Becomes Her;Drama;Willis, Bruce;Streep, Meryl;Zemeckis, Robert;61;No;merylStreep.png 1991;28;Kids & Pesticides;Drama;Whyatt, Robin;Streep, Meryl;;36;No;merylStreep.png 1970;129;On a Clear Day You Can See Forever;Music;Montand, Yves;Streisand, Barbra;Minnelli, Vincente;67;No;NicholasCage.png 1987;100;Nuts;Drama;Dreyfuss, Richard;Streisand, Barbra;Ritt, Martin;52;No;NicholasCage.png 1983;134;Yentl;Music;Patinkin, Mandy;Streisand, Barbra;Streisand, Barbra;46;No;NicholasCage.png 1968;151;Funny Girl;Music;Sharif, Omar;Streisand, Barbra;Wyler, William;30;Yes;NicholasCage.png 1990;97;Fellow Traveller;Drama;Travanti, Daniel J.;Stubbs, Imogen;Towns, Philip Saville;39;No;NicholasCage.png 1970;140;Dodesukaden;Drama;Zushi, Yoshitaka;Sugai, 
Kin;Kurosawa, Akira;75;No;NicholasCage.png 1987;;Sicilian, The;Drama;Lambert, Christopher;Sukowa, Barbara;Cimino, Michael;41;No;NicholasCage.png 1941;117;So Ends Our Night;Drama;March, Fredric;Sullavan, Margaret;Cromwell, John;2;No;NicholasCage.png 1984;102;Sword of the Valiant;Action;O'Keeffe, Miles;Sutton, Emma;Weeks, Stephen;5;No;NicholasCage.png 1949;78;Devil's Wanton, The;Drama;Malmsten, Birger;Svedlund, Doris;Bergman, Ingmar;66;No;Bergman.png 1989;99;Driving Miss Daisy;Drama;Freeman, Morgan;Tandy, Jessica;Beresford, Bruce;6;Yes;NicholasCage.png 1991;111;Seventh Cross, The;Drama;Tracy, Spencer;Tandy, Jessica;;35;No;spencerTracy.png 1983;105;Between Friends;Drama;Ramer, Henry;Taylor, Elizabeth;Antonio, Lou;54;No;elizabethTaylor.png 1957;173;Raintree County;Drama;Clift, Montgomery;Taylor, Elizabeth;Dmytryk, Edward;74;No;elizabethTaylor.png 1975;101;Driver's Seat, The;Drama;Bannen, Ian;Taylor, Elizabeth;Griffi, Giuseppe Patroni;72;No;elizabethTaylor.png 1967;109;Reflections in a Golden Eye;Drama;Brando, Marlon;Taylor, Elizabeth;Huston, John;81;No;elizabethTaylor.png 1972;110;X, Y & Zee;Drama;Caine, Michael;Taylor, Elizabeth;Hutton, Brian G.;87;No;elizabethTaylor.png 1968;109;Secret Ceremony;Drama;Mitchum, Robert;Taylor, Elizabeth;Losey, Joseph;60;No;elizabethTaylor.png 1963;243;Cleopatra;Drama;Burton, Richard;Taylor, Elizabeth;Mankiewicz, Joseph L.;80;No;elizabethTaylor.png 1950;;Father of the Bride;Comedy;Taylor, Rod;Taylor, Elizabeth;Minnelli, Vincente;54;No;elizabethTaylor.png 1992;130;Who's Afraid of Virginia Woolf?;Drama;Burton, Richard;Taylor, Elizabeth;Nichols, Mike;82;Yes;elizabethTaylor.png 1977;110;A Little Night Music;Music;Cariou, Len;Taylor, Elizabeth;Prince, Harold;61;No;elizabethTaylor.png 1956;201;Giant;Drama;Hudson, Rock;Taylor, Elizabeth;Stevens, George;61;Yes;elizabethTaylor.png 1985;94;Rumor Mill, The;Drama;Dysart, Richard A.;Taylor, Elizabeth;Trikonis, Gus;62;No;elizabethTaylor.png 1943;90;Lassie Come Home;Drama;McDowall, Roddy;Taylor, 
Elizabeth;Wilcox, Fred M;79;No;elizabethTaylor.png 1993;76;Return Engagement;Drama;Bottoms, Joseph;Taylor, Elizabeth;;26;No;elizabethTaylor.png 1972;108;Hammersmith Is Out;Drama;Burton, Richard;Taylor, Elizabeth;;80;No;elizabethTaylor.png 1991;60;Super Duper Bloopers;Comedy;Cooper, Gary;Taylor, Elizabeth;;21;No;elizabethTaylor.png 1991;;Elizabeth Taylor Collection, The;Drama;Fisher, Eddie;Taylor, Elizabeth;;21;No;elizabethTaylor.png 1973;99;Ash Wednesday;Drama;Fonda, Henry;Taylor, Elizabeth;;54;No;elizabethTaylor.png 1991;117;Last Time I Saw Paris, The;Drama;Johnson, Van;Taylor, Elizabeth;;13;No;elizabethTaylor.png 1931;125;Cimarron;Western;Dix, Richard;Taylor, Estelle;Ruggles, Wesley;44;Yes;NicholasCage.png 1992;83;Apache Woman;Western;Bridges, Lloyd;Taylor, Joan;Corman, Roger;32;No;NicholasCage.png 1984;;Gary Numan - Berzerker;Music;Webb, John;Taylor, Karen;;60;No;NicholasCage.png 1988;101;Mystic Pizza;Comedy;Moses, William;Taylor, Lili;Petrie, Donald;74;No;NicholasCage.png 1991;95;Dogfight;Action;Phoenix, River;Taylor, Lili;Savoca, Nancy;66;No;NicholasCage.png 1935;234;Adventures of Rex & Rinty, The;Western;Rex the Wonder Horse;Taylor, Norma;Beebe, Ford;87;No;NicholasCage.png 1988;60;Daphnis & Chloe;Music;Morrow, Carl;Taylor, Victoria;Wimhurst, Jolyon;85;No;NicholasCage.png 1980;97;Marathon;Comedy;Newhart, Bob;Taylor-Young, Leigh;Cooper, Jackie;76;No;NicholasCage.png 1948;127;Fort Apache;Western;Fonda, Henry;Temple, Shirley;Ford, John;4;No;johnFord.png 1937;100;Wee Willie Winkie;Drama;Romero, Cesar;Temple, Shirley;Ford, John;78;No;johnFord.png 1987;91;Big Shots;Action;Busker, Ricky;Thayer, Brynn;Mandel, Robert;5;No;NicholasCage.png 1988;85;Doin' Time on Planet Earth;Comedy;Strouse, Nocholas;Thompson, Andrea;Matthau, Charles;44;Yes;NicholasCage.png 1983;91;All the Right Moves;Drama;Cruise, Tom;Thompson, Lea;Chapman, Michael;65;No;NicholasCage.png 1987;93;Some Kind of Wonderful;Drama;Stoltz, Eric;Thompson, Lea;Deutch, Howard;16;No;NicholasCage.png 1990;87;All New 
Tales from the Crypt, A Trilogy;Horror;Walsh, M. Emmet;Thompson, Lea;Deutch, Howard;33;No;NicholasCage.png 1985;116;Back to the Future;Comedy;Fox, Michael J.;Thompson, Lea;Zemeckis, Robert;9;No;NicholasCage.png 1963;80;Winter Light;Drama;Björnstrand, Gunnar;Thulin, Ingrid;Bergman, Ingmar;2;No;Bergman.png 1963;95;Silence, The;Drama;Malmsten, Birger;Thulin, Ingrid;Bergman, Ingmar;79;No;Bergman.png 1959;100;Magician, The;Drama;Sydow, Max von;Thulin, Ingrid;Bergman, Ingmar;3;No;Bergman.png 1961;154;Four Horsemen of the Apocalypse, The;Drama;Ford, Glenn;Thulin, Ingrid;Minnelli, Vincente;71;No;glennFord.png 1986;99;Critical Condition;Comedy;Pryor, Richard;Ticotin, Rachel;Apted, Michael;41;No;NicholasCage.png 1989;88;Center of the Web;Mystery;Curtis, Tony;Tilton, Charlene;;42;No;NicholasCage.png 1990;110;Border Shootout;Action;Ford, Glenn;Tilton, Charlene;;7;No;glennFord.png 1989;109;Lean on Me;Drama;Freeman, Morgan;Todd, Beverly;Avildsen, John G.;51;No;NicholasCage.png 1986;221;On Wings of Eagles;Drama;Lancaster, Burt;Towers, Constance;McLaglen, Andrew V.;53;No;burtLancaster.png 1941;94;Texas;Western;Holden, William;Trevor, Claire;Marshall, George;79;No;NicholasCage.png 1939;80;Allegheny Uprising;Drama;Wayne, John;Trevor, Claire;Seiter, William A.;53;No;johnWayne.png 1940;95;Dark Command;Western;Wayne, John;Trevor, Claire;Walsh, Raoul;52;No;johnWayne.png 1986;103;Peggy Sue Got Married;Drama;Cage, Nicolas;Turner, Kathleen;Coppola, Francis Ford;62;No;NicholasCage.png 1989;84;Dear America, Letters Home from Vietnam;War;De Niro, Robert;Turner, Kathleen;Couturie, Bill;57;No;NicholasCage.png 1985;130;Prizzi's Honor;Comedy;Nicholson, Jack;Turner, Kathleen;Huston, John;25;Yes;JackNicholson.png 1983;90;Man with Two Brains, The;Comedy;Martin, Steve;Turner, Kathleen;Reiner, Carl;68;No;NicholasCage.png 1984;101;Crimes of Passion;Drama;Perkins, Anthony;Turner, Kathleen;Russell, Ken;4;No;NicholasCage.png 1985;106;Jewel of the Nile, The;Action;Douglas, Michael;Turner, Kathleen;Teague, 
Lewis;68;No;NicholasCage.png 1984;106;Romancing the Stone;Action;Douglas, Michael;Turner, Kathleen;Zemeckis, Robert ;83;No;NicholasCage.png 1988;121;Accidental Tourist, The;Comedy;Hurt, William;Turner, Kathleen;;56;Yes;NicholasCage.png 1955;117;Sea Chase, The;War;Wayne, John;Turner, Lana;Farrow, John;4;No;johnWayne.png 1958;98;Another Time, Another Place;Drama;Connery, Sean;Turner, Lana;;4;No;seanConnery.png 1988;90;Cannibal Women in the Avocado Jungle of Death;Comedy;Primus, Barry;Tweed, Shannon;Lawton, J.F.;56;No;NicholasCage.png 1986;91;Mr Love.;Comedy;Jackson, Barry;Tyzack, Margaret;Battersby, Roy;10;No;NicholasCage.png 1968;139;2001: A Space Odyssey;Science Fiction;Dullea, Keir;Tyzack, Margaret;Kubrick, Stanley;83;No;NicholasCage.png 1966;81;Persona;Drama;Björnstrand, Gunnar;Ullman, Liv;Bergman, Ingmar;81;Yes;Bergman.png 1973;;Scenes from a Marriage;Drama;Josephson, Erland;Ullman, Liv;Bergman, Ingmar;3;Yes;Bergman.png 1968;88;Hour of the Wolf;Drama;Sydow, Max von;Ullman, Liv;Bergman, Ingmar;37;No;Bergman.png 1969;101;Passion of Anna, The;Drama;Sydow, Max von;Ullman, Liv;Bergman, Ingmar;6;No;Bergman.png 1984;96;Dangerous Moves;Drama;Caron, Leslie;Ullman, Liv;Dembo, Richard;7;Yes;NicholasCage.png 1957;147;Sayonara;Drama;Brando, Marlon;Umeki, Miyoshi;Logan, Joshua;19;Yes;brando.png 1968;158;Where Eagles Dare;War;Burton, Richard;Ure, Mary;Hulton, Brian G.;57;No;NicholasCage.png 1985;95;Teen Wolf;Drama;Fox, Michael J.;Ursitti, Susan;Daniel, Rod;58;No;NicholasCage.png 1990;88;Amazon;Action;Davi, Robert;Vaananen, Kari;Kaurismäki, Mika;30;No;NicholasCage.png 1973;;Paper Chase, The;Drama;Bottoms, Timothy;Wagner, Lindsay;Bridges, James;7;Yes;NicholasCage.png 1959;88;Virgin Spring, The;Drama;Sydow, Max von;Valberg, Brigitta;Bergman, Ingmar;8;Yes;Bergman.png 1970;97;Spider's Stratagem;Drama;Brogi, Giulio;Valli, Alida;Bertolucci, Bernardo;45;No;NicholasCage.png 1971;102;Play Misty for Me;Mystery;Eastwood, Clint;Walter, Jessica;Eastwood, Clint;47;No;clintEastwood.png 
1981;88;Going Ape;Comedy;Danza, Tony;Walter, Jessica;Kronsberg, Jeremy Joe;65;No;NicholasCage.png 1967;127;Cool Hand Luke;Drama;Newman, Paul;Van Fleet, Jo;Rosenberg, Stuart;49;Yes;paulNewman.png 1988;89;Phantom of the Ritz;Horror;Bergman, Peter;Van Valkenburgh, Deborah;Plone, Allen;85;No;NicholasCage.png 1990;85;Crash & Burn;Science Fiction;Ganus, Paul;Ward, Megan;Band, Charles;75;No;NicholasCage.png 1991;114;After Dark My Sweet;Mystery;Patric, Jason;Ward, Rachel;Foley, James;33;No;NicholasCage.png 1992;121;Christopher Columbus: The Discovery;Adventure;Brando, Marlon;Ward, Rachel;Glen, John;39;No;NicholasCage.png 1986;109;Young Sherlock Holmes;Mystery;Rowe, Nicholas;Ward, Sophie;Levinson, Barry;16;No;NicholasCage.png 1991;104;Doc Hollywood;Comedy;Fox, Michael J.;Warner, Julie;Caton-Jones, Michael;64;No;NicholasCage.png 1988;96;Baja Oklahoma;Comedy;Coyote, Peter;Warren, Lesley Ann;Roth, Bobby;71;No;NicholasCage.png 1986;137;Aliens;Science Fiction;Biehn, Michael;Weaver, Sigourney;Cameron, James;82;No;weaver.png 1992;115;Alien Three;Science Fiction;Dutton, Charles;Weaver, Sigourney;Fincher, David;59;No;weaver.png 1997;109;Alien: resurrection;Science Fiction;Perlman, Ron;Weaver, Sigourney;Jeunet, Jean-Pierre;60;No;weaver.png 1979;117;Alien;Science Fiction;Skerritt, Tom;Weaver, Sigourney;Scott, Ridley;83;No;weaver.png 1985;97;One Woman or Two;Comedy;Depardieu, Gérard;Weaver, Sigourney;Vigne, Daniel;64;No;weaver.png 1984;96;Soggy Bottom U. S. 
A.;Comedy;Johnson, Ben;Wedgeworth, Ann;Flicker, Theodore J.;50;No;NicholasCage.png 1973;96;Bang the Drum Slowly;Drama;Moriarty, Michael;Wedgeworth, Ann;Hancock, John D.;73;No;NicholasCage.png 1974;82;Catamount Killing, The;Action;Buchholz, Horst;Wedgeworth, Ann;Zanussi, Krzystoff;84;No;NicholasCage.png 1972;92;Fuzz;Action;Reynolds, Burt;Welch, Raquel;Colla, Richard A.;37;No;NicholasCage.png 1966;101;Shoot Loud, Louder, I Don't Understand!;Mystery;Mastroianni, Marcello;Welch, Raquel;De Filippo, Eduardo;70;No;NicholasCage.png 1967;107;Bedazzled;Comedy;Cook, Peter;Welch, Raquel;Donen, Stanley;67;No;NicholasCage.png 1977;120;Prince & the Pauper, The;Action;Reed, Oliver;Welch, Raquel;Fleischer, Richard;86;No;NicholasCage.png 1969;110;One Hundred Rifles;Western;Reynolds, Burt;Welch, Raquel;Gries, Tom;48;No;NicholasCage.png 1975;90;Wild Party, The;Drama;Dukes, David;Welch, Raquel;Ivory, James;75;No;NicholasCage.png 1968;106;Bandolero!;Western;Stewart, James;Welch, Raquel;McLaglen, Andrew V.;9;No;NicholasCage.png 1973;119;Last of Sheila, The;Mystery;Coburn, James;Welch, Raquel;Ross, Herbert;39;No;NicholasCage.png 1972;87;Hannie Caulder;Drama;Borgnine, Ernest;Welch, Raquel;;9;No;NicholasCage.png 1990;;Sounds of the Seventies...& the Beat Goes;Music;Jones, Tom;Welch, Raquel;;13;No;NicholasCage.png 1988;161;Bird;Drama;Whitaker, Forest;Venora, Diane;Eastwood, Clint;24;No;NicholasCage.png 1955;60;Meet Millie;Drama;Halop, Florence;Verdugo, Elena;;82;No;NicholasCage.png 1987;88;Hell Comes to Frogtown;Science Fiction;LeFlore, Julius;Verrell, Cec;Jackson, Donald G;74;No;NicholasCage.png 1966;126;Fortune Cookie, The;Comedy;Lemmon, Jack;West, Judi;Wilder, Billy;3;Yes;NicholasCage.png 1990;92;Sun Shines Bright, The;Action;Winninger, Charles;Whelan, Arleen;Ford, John;46;No;johnFord.png 1987;106;Squeeze, The;Action;Keach, Stacy;White, Carol;Apted, Michael;23;No;NicholasCage.png 1970;91;Start the Revolution Without Me;Comedy;Wilder, Gene;Whitelaw, Billie;Yorkin, 
Bud;62;No;NicholasCage.png 1989;107;Major League;Comedy;Sheen, Charlie;Whitton, Margaret;Ward, David S.;64;No;NicholasCage.png 1990;108;Bright Lights, Big City;Drama;Fox, Michael J.;Wiest, Dianne;Bridges, James;30;No;NicholasCage.png 1987;97;Lost Boys, The;Horror;Patric, Jason;Wiest, Dianne;Schumacher, Joel;67;No;NicholasCage.png 1989;93;Cookie;Comedy;Falk, Peter;Wiest, Dianne;Seidelman, Susan;43;No;NicholasCage.png 1974;114;Conversation, The;Drama;Hackman, Gene;Williams, Cindy;Coppola, Francis Ford;59;Yes;NicholasCage.png 1973;112;American Graffiti;Comedy;Dreyfuss, Richard;Williams, Cindy;Lucas, George;39;Yes;NicholasCage.png 1953;96;Dangerous When Wet;Music;Lamas, Fernando;Williams, Esther;Walters, Charles;67;No;NicholasCage.png 1980;111;Stir Crazy;Comedy;Pryor, Richard;Williams, JoBeth;Poitier, Sidney;40;No;NicholasCage.png 1989;91;Young Einstein;Comedy;Serious, Yahoo;Wilson, Pee-Wee;Serious, Yahoo;47;No;NicholasCage.png 1956;83;Killing, The;Drama;Hayden, Sterling;Windsor, Marie;Kubrick, Stanley;51;No;NicholasCage.png 1973;102;Cahill, United States Marshal;Western;Wayne, John;Windsor, Marie;McLaglen, Andrew V.;12;No;johnWayne.png 1989;90;Savage Intruder, The;Horror;Garfield, John David;Wing, Virginia;Wolfe, Donald;24;No;NicholasCage.png 1992;139;Sheltering Sky, The;Drama;Malkovich, John;Winger, Debra;Bertolucci, Bernardo;64;No;NicholasCage.png 1982;125;An Officer & a Gentleman;Drama;Gere, Richard;Winger, Debra;Hackford, Taylor;1;Yes;NicholasCage.png 1987;101;Black Widow;Mystery;Hopper, Dennis;Winger, Debra;Rafelson, Bob;54;No;NicholasCage.png 1986;116;Legal Eagles;Comedy;Redford, Robert;Winger, Debra;Reitman, Ivan;39;No;NicholasCage.png 1970;90;Bloody Mama;Action;Stroud, Don;Winters, Shelley;Corman, Roger;17;No;NicholasCage.png 1965;106;A Patch of Blue;Drama;Poitier, Sidney;Winters, Shelley;Green, Guy;51;No;NicholasCage.png 1955;109;I Died a Thousand Times;Drama;Palance, Jack;Winters, Shelley;Heisler, Stuart;23;No;NicholasCage.png 
1977;90;Tentacles;Horror;Huston, John;Winters, Shelley;Hellman, Oliver;62;No;NicholasCage.png 1968;100;Scalphunters, The;Western;Lancaster, Burt;Winters, Shelley;Pollack, Sydney;33;No;burtLancaster.png 1992;96;A Day in October;Drama;Sweeney, D. B.;Wolf, Kelly;Madsen, Kenneth;76;No;NicholasCage.png 1964;102;A Fistful of Dollars;Westerns;Eastwood, Clint;Volonte, Gian Maria;Leone, Sergio;61;No;clintEastwood.png 1985;94;My Science Project;Comedy;Stockwell, John;Von Zerneck, Danielle;Betnel, Jonathan;84;No;NicholasCage.png 1991;160;Great Race, The;Comedy;Moore, Dudley;Wood, Natalie;Edwards, Blake;88;No;NicholasCage.png 1956;119;Searchers, The;Western;Wayne, John;Wood, Natalie;Ford, John;9;No;johnWayne.png 1979;105;Meteor;Action;Connery, Sean;Wood, Natalie;Neame, Ronald;5;No;seanConnery.png 1955;111;Rebel Without a Cause;Drama;Dean, James;Wood, Natalie;Ray, Nicholas;82;No;NicholasCage.png 1961;153;West Side Story;Music;Beymer, Richard;Wood, Natalie;Wise, Robert;38;Yes;NicholasCage.png 1970;110;Trash;Comedy;Dallesandro, Joe;Woodlawn, Holly;Morrissey, Paul;68;No;NicholasCage.png 1966;95;A Big Hand for the Little Lady;Comedy;Fonda, Henry;Woodward, Joanne;Cook, Fielder;12;No;NicholasCage.png 1966;104;A Fine Madness;Comedy;Connery, Sean;Woodward, Joanne;Kershner, Irvin;6;No;seanConnery.png 1987;134;Glass Menagerie, The;Drama;Malkovich, John;Woodward, Joanne;Newman, Paul;68;No;NicholasCage.png 1989;117;Harry & Son;Drama;Newman, Paul;Woodward, Joanne;Newman, Paul;57;No;paulNewman.png 1968;102;Rachel, Rachel;Drama;Olson, James;Woodward, Joanne;Newman, Paul;32;No;NicholasCage.png 1961;98;Paris Blues;Drama;Newman, Paul;Woodward, Joanne;Ritt, Martin;54;No;paulNewman.png 1960;135;Fugitive Kind, The;Drama;Brando, Marlon;Woodward, Joanne;;3;No;brando.png 1993;;Mr. & Mrs. 
Bridge;Drama;Newman, Paul;Woodward, Joanne;;29;No;paulNewman.png 1991;144;State of Grace;Drama;Penn, Sean;Wright, Robin;Joanou, Phil;49;No;NicholasCage.png 1943;108;Shadow of a Doubt;Drama;Cotten, Joseph;Wright, Teresa;Hitchcock, Alfred;32;No;alfredHitchcock.png 1950;85;Men, The;Drama;Brando, Marlon;Wright, Teresa;Zinnemann, Fred;27;No;brando.png 1950;110;Stage Fright;Mystery;Wilding, Michael;Wyman, Jane;Hitchcock, Alfred;72;No;alfredHitchcock.png 1947;103;Magic Town;Drama;Stewart, James;Wyman, Jane;Wellman, William;4;No;NicholasCage.png 1975;93;That Lucky Touch;Action;Moore, Roger;York, Susannah;Miles, Christopher;85;No;NicholasCage.png 1949;90;Lust for Gold;Drama;Ford, Glenn;Young, Gig;Simon, S. Sylvan;57;No;glennFord.png 1987;103;Heat;Mystery;Reynolds, Burt;Young, Karen;Jameson, Jerry;69;No;NicholasCage.png 1993;75;Employee's Entrance;Drama;William, Warren;Young, Loretta;;0;No;NicholasCage.png 1947;87;Night Is My Future;Drama;Malmsten, Birger;Zetterling, Mai;Bergman, Ingmar;17;No;Bergman.png 1990;92;Witches, The;Science Fiction;Fisher, Jasen;Zetterling, Mai;Roeg, Nicolas;18;No;NicholasCage.png 1953;94;Vera Cruz;Action;Cooper, Gary;;Aldrich, Robert;71;No;NicholasCage.png 1954;91;Apache;Western;Lancaster, Burt;;Aldrich, Robert;78;No;burtLancaster.png 1977;146;Twilight's Last Gleaming;Drama;Lancaster, Burt;;Aldrich, Robert;84;No;burtLancaster.png 1979;119;Frisco Kid, The;Comedy;Wilder, Gene;;Aldrich, Robert;10;No;NicholasCage.png 1954;30;Bank on the Stars;Drama;Paar, Jack;;Allen, Craig;;No;NicholasCage.png 1987;100;Law of Desire;Drama;Maura, Carmen;;Almodóvar, Pedro;73;No;NicholasCage.png 1966;103;Quiller Memorandum, The;Mystery;Segal, George;;Anderson, Michael;34;No;NicholasCage.png 1962;183;Longest Day, The;War;Wayne, John;;Annakin, Ken;7;No;johnWayne.png 1986;128;Name of the Rose, The;Drama;Connery, Sean;;Annaud, Jean-Jacques;8;No;seanConnery.png 1988;92;Bloodsport;Action;Van Damme, Jean-Claude;;Arnold, Newt;78;No;NicholasCage.png 1986;85;Torment;Horror;Gilbert, 
Taylor;;Aslanian, Samson;8;No;NicholasCage.png 1988;138;Pelle the Conqueror;Drama;Sydow, Max von;;August, Bille;14;Yes;NicholasCage.png 1981;118;Taps;Drama;Hutton, Timothy;;Becker, Harold;84;No;NicholasCage.png 1991;102;Freshman, The;Comedy;Brando, Marlon;;Bergman, Andrew;32;No;brando.png 1987;164;Last Emperor, The;Drama;Lone, John;;Bertolucci, Bernardo;1;Yes;NicholasCage.png 1962;100;Grim Reaper, The;Drama;Rulu, Francesco;;Bertolucci, Bernardo;35;No;NicholasCage.png 1983;90;Le Dernier Combat;Drama;Jolivet, Pierre;;Besson, Luc;72;No;NicholasCage.png 1989;91;Too Beautiful for You;Drama;Depardieu, Gérard;;Blier, Bertrand;35;No;NicholasCage.png 1991;105;Fire, Ice & Dynamite;Action;Moore, Roger;;Bogner, Willy;72;No;NicholasCage.png 1963;113;Heavens Above;Comedy;Sellers, Peter;;Boulting, John;38;No;NicholasCage.png 1961;141;One Eyed Jacks;Western;Malden, Karl;;Brando, Marlon;26;No;brando.png 1937;61;Swing It, Sailor!;Comedy;Ford, Wallace;;Cannon, Raymond;83;No;NicholasCage.png 1987;94;Wolf at the Door, The;Drama;Sutherland, Donald;;Carlsen, Henning;68;No;NicholasCage.png 1936;87;Modern Times;Comedy;Chaplin, Charles;;Chaplin, Charles;4;No;NicholasCage.png 1991;114;Thunderbolt & Lightfoot;Action;Eastwood, Clint;;Cimino, Michael;16;No;clintEastwood.png 1931;87;A Nous la Liberte;Drama;Marchand, Henri;;Clair, Rene;60;No;NicholasCage.png 1979;95;Scum;Action;Winstone, Ray;;Clarke, Alan;68;No;NicholasCage.png 1984;90;Inside Man, The;Action;Hopper, Dennis;;Clegg, Tom;45;No;NicholasCage.png 1979;153;Apocalypse Now;Drama;Brando, Marlon;;Coppola, Francis Ford;8;No;brando.png 1990;94;Bellboy & the Playgirls, The;Drama;Wilkinson, June;;Coppola, Francis Ford;7;No;NicholasCage.png 1963;81;Terror, The;Horror;Karloff, Boris;;Corman, Roger;88;No;NicholasCage.png 1963;86;Raven, The;Horror;Price, Vincent;;Corman, Roger;85;No;NicholasCage.png 1975;87;They Came from Within;Horror;Hampton, Paul;;Cronenberg, David;21;No;NicholasCage.png 1986;97;Boy in Blue, The;Drama;Cage, Nicolas;;Dale, 
Cynthia;63;No;NicholasCage.png 1991;87;Killer Tomatoes Strike Back;Comedy;Astin, John;;De Bello, John;24;No;NicholasCage.png 1979;87;Attack of the Killer Tomatoes;Comedy;Wilson, George;;De Bello, John;47;No;NicholasCage.png 1987;119;Untouchables, The;Drama;Connery, Sean;;De Palma, Brian;7;Yes;seanConnery.png 1986;91;Wise Guys;Comedy;Piscopo, Joe;;De Palma, Brian;16;No;NicholasCage.png 1989;90;American Autobahn;Drama;Jalenak, Jan;;Degas, Andre;75;No;NicholasCage.png 1990;94;Final Alliance, The;Action;Hasselhoff, David;;Di Leo, Mario;10;No;NicholasCage.png 1984;130;Bounty, The;Drama;Gibson, Mel;;Donaldson, Roger;25;No;NicholasCage.png 1974;89;Little Prince, The;Music;Kiley, Richard;;Donen, Stanley;31;No;NicholasCage.png 1975;94;Posse;Western;Douglas, Kirk;;Douglas, Kirk;76;No;NicholasCage.png 1982;136;Firefox;Action;Eastwood, Clint;;Eastwood, Clint;64;No;clintEastwood.png 1987;91;Penitentiary III;Action;Kennedy, Leon Isaac;;Fanaka, Jamaa;82;No;NicholasCage.png 1993;;Ginger & Fred;Comedy;Mastroianni, Marcello;;Fellini, Federico;29;No;NicholasCage.png 1966;107;Wrong Box, The;Comedy;Mills, John;;Forbes, Bryan;40;No;NicholasCage.png 1990;86;Wagonmaster;Western;Johnson, Ben;;Ford, John;1;No;johnFord.png 1945;135;They Were Expendable;War;Montgomery, Robert;;Ford, John;88;No;johnFord.png 1991;125;Last Hurrah, The;Drama;Tracy, Spencer;;Ford, John;46;No;spencerTracy.png 1949;59;Law of the Golden West;Western;Hale, Monte;;Ford, Philip;1;No;NicholasCage.png 1949;60;Pioneer Marshal;Western;Hale, Monte;;Ford, Philip;8;No;NicholasCage.png 1949;60;Ranger of the Cherokee Strip;Western;Hale, Monte;;Ford, Philip;31;No;NicholasCage.png 1950;60;Vanishing Westerner;Western;Hale, Monte;;Ford, Philip;6;No;NicholasCage.png 1948;59;Bandits of Dark Canyon;Western;Lane, Allan;;Ford, Philip;72;No;NicholasCage.png 1948;60;Bold Frontiersman, The;Western;Lane, Allan;;Ford, Philip;18;No;NicholasCage.png 1948;59;Wild Frontier, The;Western;Lane, Allan;;Ford, Philip;61;No;NicholasCage.png 
1968;73;Firemen's Ball, The;Comedy;Vostrcil, Jan;;Forman, Milos;8;No;NicholasCage.png 1983;112;Local Hero;Comedy;Riegert, Peter;;Forsyth, Bill;54;No;NicholasCage.png 1971;104;French Connection, The;Drama;Hackman, Gene;;Friedkin, William;88;Yes;NicholasCage.png 1985;114;To Live & Die in L. A.;Action;Stockwell, Dean;;Friedkin, William;70;No;NicholasCage.png 1961;113;Ferry to Hong Kong;Drama;Welles, Orson;;Gilbert, Lewis;77;No;NicholasCage.png 1983;69;Eddie Murphy, Delirious;Comedy;Murphy, Eddie;;Gower, Bruce;6;No;NicholasCage.png 1984;77;Secret Policeman's Private Parts, The;Comedy;Cleese, John;;Graef, Roger;36;No;NicholasCage.png 1958;83;Up the Creek;Comedy;Sellers, Peter;;Guest, Val;54;No;NicholasCage.png 1982;111;Yol;Drama;Akan, Tarik;;Guney, Yilmaz;53;No;NicholasCage.png 1989;150;Sara Dane;Drama;Hopkins, Harold;;Hardy, Rod;75;No;NicholasCage.png 1988;84;Night Tide;Drama;Muir, Gavin;;Harrington, Curtis;50;No;NicholasCage.png 1953;92;His Majesty O'Keefe;Action;Lancaster, Burt;;Haskin, Byron;3;No;burtLancaster.png 1960;122;North to Alaska;Western;Wayne, John;;Hathaway, Henry;31;No;johnWayne.png 1966;76;Flight to Fury;Action;Nicholson, Jack;;Hellman, Monte;70;No;NicholasCage.png 1966;82;Ride in the Whirlwind;Western;Nicholson, Jack;;Hellman, Monte;26;No;NicholasCage.png 1970;93;Powderkeg;Western;Taylor, Rod;;Heyes, Douglas;26;No;NicholasCage.png 1953;95;I Confess;Drama;Clift, Montgomery;;Hitchcock, Alfred;63;No;alfredHitchcock.png 1935;88;Thirty-Nine Steps, The;Science Fiction;Donat, Robert;;Hitchcock, Alfred;8;No;alfredHitchcock.png 1969;126;Topaz;Mystery;Forsythe, John;;Hitchcock, Alfred;12;No;alfredHitchcock.png 1930;95;Murder;Mystery;Marshall, Herbert;;Hitchcock, Alfred;50;No;alfredHitchcock.png 1954;123;Dial M for Murder;Mystery;Milland, Ray;;Hitchcock, Alfred;52;No;alfredHitchcock.png 1937;80;Young & Innocent;Mystery;Pilbeam, Nova;;Hitchcock, Alfred;43;No;alfredHitchcock.png 1976;95;Creature from Black Lake;Horror;Elam, Jack;;Houck, Joy, 
Jr.;88;No;NicholasCage.png 1981;124;Chariots of Fire;Drama;Cross, Ben;;Hudson, Hugh;6;Yes;NicholasCage.png 1982;81;Monty Python Live at the Hollywood Bowl;Comedy;Chapman, Graham;;Hughes, Terry;81;No;NicholasCage.png 1975;129;Man Who Would Be King, The;Drama;Connery, Sean;;Huston, John;6;No;seanConnery.png 1981;117;Victory;Drama;Stallone, Sylvester;;Huston, John;39;No;NicholasCage.png 1970;146;Kelly's Heroes;War;Eastwood, Clint;;Hutton, Brian G.;84;No;clintEastwood.png 1989;109;Next of Kin;Mystery;Swayze, Patrick;;Irvin, John;63;No;NicholasCage.png 1990;96;Chattahoochee;Drama;Oldman, Gary;;Jackson, Mick;30;No;NicholasCage.png 1985;82;Angelic Conversation, The;Comedy;Reynolds, Paul;;Jarman, Derek;41;No;NicholasCage.png 1986;107;Down by Law;Comedy;Waits, Tom;;Jarmusch, Jim;49;No;NicholasCage.png 1984;141;Killing Fields, The;Drama;Waterston, Sam;;Joffe, Roland;6;Yes;NicholasCage.png 1992;85;Survival Zone;Action;Ford, Terence;;Jones, Chris;25;No;NicholasCage.png 1979;94;Monty Python's Life of Brian;Comedy;Chapman, Graham;;Jones, Terry;11;No;NicholasCage.png 1983;107;Monty Python's the Meaning of Life;Comedy;Cleese, John;;Jones, Terry;33;No;NicholasCage.png 1971;121;Red Tent, The;Action;Finch, Peter;;Kalatozov, Mikhail;7;No;NicholasCage.png 1945;82;Dakota;Western;Wayne, John;;Kane, Joseph;27;No;johnWayne.png 1952;112;Viva Zapata!;Drama;Brando, Marlon;;Kazan, Elia;86;Yes;brando.png 1968;133;Green Berets, The;War;Wayne, John;;Kellogg, Ray;36;No;johnWayne.png 1990;90;Big Bad John;Action;English, Doug;;Kennedy, Burt;84;No;NicholasCage.png 1937;71;Ticket of Leave Man, The;Mystery;Slaughter, Tod;;King, George;45;No;NicholasCage.png 1956;106;D-Day, The Sixth of June;War;Taylor, Robert;;Koster, Henry;84;No;NicholasCage.png 1974;121;Apprenticeship of Duddy Kravitz, The;Drama;Dreyfuss, Richard;;Kotcheff, Ted;64;Yes;NicholasCage.png 1971;138;A Clockwork Orange;Science Fiction;McDowell, Malcolm;;Kubrick, Stanley;83;Yes;NicholasCage.png 1991;117;Full Metal Jacket;War;Modine, 
Matthew;;Kubrick, Stanley;45;No;NicholasCage.png 1943;82;Sanshiro Sugata;Drama;Fujita, Susumu;;Kurosawa, Akira;85;No;NicholasCage.png 1991;97;Rhapsody in August;Drama;Gere, Richard;;Kurosawa, Akira;50;No;NicholasCage.png 1946;110;No Regrets for Our Youth;Drama;Hara, Setsuko;;Kurosawa, Akira;31;No;NicholasCage.png 1960;152;Bad Sleep Well, The;Drama;Mifune, Toshiro;;Kurosawa, Akira;65;No;NicholasCage.png 1951;166;Idiot, The;Drama;Mifune, Toshiro;;Kurosawa, Akira;40;No;NicholasCage.png 1951;83;Rashomon;Drama;Mifune, Toshiro;;Kurosawa, Akira;59;Yes;NicholasCage.png 1962;96;Sanjuro;Mystery;Mifune, Toshiro;;Kurosawa, Akira;6;No;NicholasCage.png 1955;200;Seven Samurai;Drama;Mifune, Toshiro;;Kurosawa, Akira;9;No;NicholasCage.png 1957;110;Throne of Blood;Drama;Mifune, Toshiro;;Kurosawa, Akira;60;No;NicholasCage.png 1961;110;Yojimbo;Action;Mifune, Toshiro;;Kurosawa, Akira;60;No;NicholasCage.png 1980;161;Kagemusha;Drama;Nakadai, Tatsuya;;Kurosawa, Akira;74;Yes;NicholasCage.png 1952;134;Ikiru;Drama;Shimura, Takashi;;Kurosawa, Akira;36;No;NicholasCage.png 1987;90;Empire of Spiritual Ninja;Action;Berlin, Tom;;Lambert, Bruce;26;No;NicholasCage.png 1986;90;Ninja, the Violent Sorcerer;Action;;;Lambert, Bruce;;No;NicholasCage.png 1926;139;Metropolis;Science Fiction;Abel, Alfred;;Lang, Fritz;49;No;NicholasCage.png 1946;106;Cloak & Dagger;Mystery;Cooper, Gary;;Lang, Fritz;55;No;NicholasCage.png 1920;137;Spiders;Drama;De Vogy, Carl;;Lang, Fritz;29;No;NicholasCage.png 1954;90;Human Desire;Drama;Ford, Glenn;;Lang, Fritz;27;No;glennFord.png 1928;130;Spies;Drama;Klein-Rogge, Rudolf;;Lang, Fritz;49;No;NicholasCage.png 1933;120;Testament of Dr. 
Mabuse, The;Drama;Klein-Rogge, Rudolf;;Lang, Fritz;4;No;NicholasCage.png 1991;95;Fury;Drama;Tracy, Spencer;;Lang, Fritz;48;No;spencerTracy.png 1990;129;Mo' Better Blues;Drama;Washington, Denzel;;Lee, Spike;78;No;NicholasCage.png 1989;30;Matt Talbot;Drama;Ford, Seamus;;Lennon, Biddy W.;35;No;NicholasCage.png 1989;55;Will Rogers, Look Back in Laughter;Comedy;Williams, Robin;;Leo, Malcolm;6;No;NicholasCage.png 1991;130;For a Few Dollars More;Westerns;Eastwood, Clint;;Leone, Sergio;34;No;clintEastwood.png 1944;139;Thirty Seconds over Tokyo;War;Tracy, Spencer;;LeRoy, Mervyn;45;No;spencerTracy.png 1982;93;Class of 1984;Drama;King, Perry;;Lester, Mark L.;23;No;NicholasCage.png 1974;109;Juggernaut;Action;Harris, Richard;;Lester, Richard;63;No;NicholasCage.png 1987;120;Good Morning, Vietnam;Comedy;Williams, Robin;;Levinson, Barry;37;No;NicholasCage.png 1945;94;Blood on the Sun;Drama;Cagney, James;;Lloyd, Frank;76;No;NicholasCage.png 1969;161;Paint Your Wagon;Music;Marvin, Lee;;Logan, Joshua;46;No;NicholasCage.png 1964;105;Ensign Pulver;Comedy;Walker, Robert, Jr.;;Logan, Joshua;16;No;NicholasCage.png 1976;92;Street People;Action;Moore, Roger;;Lucidi, Maurizio;25;No;NicholasCage.png 1984;83;Manhunt, The;Action;Borgnine, Ernest;;Ludman, Larry;34;No;NicholasCage.png 1987;85;Operation Nam;War;Wayne, John Ethan;;Ludman, Larry;37;No;NicholasCage.png 1944;100;Fighting Seabees, The;War;Wayne, John;;Ludwig, Edward;35;No;johnWayne.png 1988;75;Let It Rock;Drama;Hopper, Dennis;;Lynch, David;32;No;lynch.png 1978;90;Eraserhead;Horror;Nance, John;;Lynch, David;2;No;lynch.png 1955;87;Ladykillers, The;Comedy;Guinness, Alec;;Mackendrick, Alexander;28;No;NicholasCage.png 1957;97;Sweet Smell of Success;Drama;Lancaster, Burt;;Mackendrick, Alexander;12;No;burtLancaster.png 1971;88;And Now for Something Completely Different;Comedy;Cleese, John;;MacNaughton, Ian;44;No;NicholasCage.png 1984;92;Crackers;Action;Sutherland, Donald;;Malle, Louis;17;No;NicholasCage.png 1991;89;Green Glove;Drama;Ford, 
Glenn;;Mate, Rudolph;54;No;glennFord.png 1970;89;Menace on the Mountain;Action;Crowley, Pat;;McEveety, Vincent;69;No;NicholasCage.png 1940;90;In Old California;Western;Wayne, John;;McGann, William;27;No;johnWayne.png 1967;85;Thirty Is a Dangerous Age, Cynthia;Comedy;Moore, Dudley;;McGrath, Joseph;28;No;NicholasCage.png 1980;99;Ffolkes;Action;Moore, Roger;;McLaglen, Andrew V.;62;No;NicholasCage.png 1970;111;Chisum;Western;Wayne, John;;McLaglen, Andrew V.;72;No;johnWayne.png 1990;135;Hunt for Red October, The;Drama;Connery, Sean;;McTiernan, John;8;No;seanConnery.png 1966;123;Closely Watched Trains;Drama;Neckar, Vaclav;;Menzel, Jiri;75;Yes;NicholasCage.png 1973;91;Executive Action;Drama;Lancaster, Burt;;Miller, David;6;No;burtLancaster.png 1942;101;Flying Tigers;Action;Wayne, John;;Miller, David;61;No;johnWayne.png 1991;87;Father's Little Dividend;Comedy;Tracy, Spencer;;Minnelli, Vincente;52;No;spencerTracy.png 1982;92;An Evening with Robin Williams;Comedy;Williams, Robin;;Mischer, Don;68;No;NicholasCage.png 1987;90;Eddie Murphy Raw;Comedy;Murphy, Eddie;;Murphy, Eddie;51;No;NicholasCage.png 1989;118;Harlem Nights;Comedy;Murphy, Eddie;;Murphy, Eddie;11;No;NicholasCage.png 1973;93;Santee;Western;Ford, Glenn;;Nelson, Gary;47;No;glennFord.png 1987;90;Good Father, The;Drama;Hopkins, Anthony;;Newell, Mike;42;No;AnthonyHopkins.png 1971;115;Sometimes a Great Notion;Drama;Newman, Paul;;Newman, Paul;7;No;paulNewman.png 1970;117;Catch Twenty-Two;Comedy;Arkin, Alan;;Nichols, Mike;50;No;NicholasCage.png 1988;90;Dark Age;Action;Jarratt, John;;Nicholson, Arch;3;No;NicholasCage.png 1981;94;Deadline;Mystery;Newman, Barry;;Nicholson, Arch;9;No;paulNewman.png 1935;60;Mysterious Mr. 
Wong;Mystery;Lugosi, Bela;;Nigh, William;71;No;NicholasCage.png 1988;92;A Month in the Country;Drama;Firth, Colin;;O'Connor, Pat;57;No;NicholasCage.png 1990;97;Prom Night III, The Last Kiss;Horror;Conlon, Tim;;Oliver, Ron;29;No;NicholasCage.png 1990;;Blood in, Blood Out;Drama;Penn, Sean;;Olmos, Edward James;88;No;NicholasCage.png 1989;94;Wrong Arm of the Law, The;Comedy;Sellers, Peter;;Owen, Cliff;25;No;NicholasCage.png 1987;116;Orphans;Drama;Finney, Albert;;Pakula, Alan J.;21;No;NicholasCage.png 1976;139;All the President's Men;Drama;Redford, Robert;;Pakula, Alan J.;45;Yes;NicholasCage.png 1987;73;J-Men Forever;Action;Bergman, Peter;;Patterson, Richard;59;No;NicholasCage.png 1969;144;Wild Bunch, The;Western;Holden, William;;Peckinpah, Sam;50;No;NicholasCage.png 1988;92;Judgement in Berlin;Drama;Sheen, Martin;;Penn, Leo;13;No;NicholasCage.png 1993;;Hot Line, The;Comedy;Boyer, Charles;;Perier, Etienne;70;No;NicholasCage.png 1988;100;Rocket Gibraltar;Drama;Lancaster, Burt;;Petrie, Daniel;26;No;burtLancaster.png 1975;112;Yakuza, The;Action;Mitchum, Robert;;Pollack, Sydney;16;No;NicholasCage.png 1972;116;Jeremiah Johnson;Drama;Redford, Robert;;Pollack, Sydney;88;No;NicholasCage.png 1970;112;Burn!;Drama;Brando, Marlon;;Pontecorvo, Gillo;75;No;brando.png 1973;122;Magnum Force;Action;Eastwood, Clint;;Post, Ted;28;No;clintEastwood.png 1989;86;Cyborg;Action;Van Damme, Jean-Claude;;Pyun, Albert;31;No;NicholasCage.png 1979;108;Prisoner of Zenda, The;Comedy;Sellers, Peter;;Quine, Richard;12;No;NicholasCage.png 1983;86;Scream;Horror;Martin, Pepper;;Quisenberry, Byron;24;No;NicholasCage.png 1986;140;Assault, The;Drama;Lint, Derek De;;Rademakers, Fons;71;Yes;NicholasCage.png 1951;102;Flying Leathernecks;Action;Wayne, John;;Ray, Nicholas;23;No;johnWayne.png 1985;92;What Comes Around;Drama;Reed, Jerry;;Reed, Jerry;49;No;NicholasCage.png 1980;123;Mon Oncle D'Amerique;Comedy;Roger-Pierre;;Resnais, Alain;71;No;NicholasCage.png 1972;92;Culpepper Cattle Company, The;Western;Grimes, 
Gary;;Richards, Dick;29;No;NicholasCage.png 1983;102;Survivors, The;Comedy;Matthau, Walter;;Ritchie, Michael;52;No;NicholasCage.png 1984;96;Roadhouse Sixty-Six;Action;Dafoe, Willem;;Robinson, John Mark;20;No;NicholasCage.png 1991;60;Burning Poles, Cecil Taylor in Performance;Music;Taylor, Cecil;;Rochlin, Sheldon;82;No;NicholasCage.png 1987;98;Russkies;Action;Hubley, Whip;;Rosenthal, Rick;87;No;NicholasCage.png 1990;96;My Blue Heaven;Comedy;Martin, Steve;;Ross, Herbert;63;No;NicholasCage.png 1990;103;Altered States;Science Fiction;Hurt, William;;Russell, Ken;22;No;NicholasCage.png 1972;128;Cowboys, The;Western;Wayne, John;;Rydell, Mark;58;No;johnWayne.png 1985;95;Code Name, Emerald;Drama;Harris, Ed;;Sanger, Jonathan;22;No;NicholasCage.png 1970;170;Patton;War;Scott, George C.;;Schaffner, Franklin J.;8;Yes;NicholasCage.png 1969;123;Midnight Cowboy;Drama;Hoffman, Dustin;;Schlesinger, John;33;Yes;NicholasCage.png 1985;131;Falcon & the Snowman, The;Drama;Hutton, Timothy;;Schlesinger, John;61;No;NicholasCage.png 1976;112;Maitresse;Drama;Ogier, Bulle;;Schroeder, Barbet;39;No;NicholasCage.png 1987;86;Disorderlies;Comedy;Boys, The Fat;;Schultz, Michael;69;No;NicholasCage.png 1991;;Raging Bull;Drama;De Niro, Robert;;Scorsese, Martin;25;No;NicholasCage.png 1991;60;Garrison Keillor's Home;Comedy;Keillor, Garrison;;Sevush, Herb;6;No;NicholasCage.png 1938;55;Overland Stage Raiders;Western;Wayne, John;;Sherman, George;83;No;johnWayne.png 1938;55;Pals of the Saddle;Western;Wayne, John;;Sherman, George;33;No;johnWayne.png 1982;92;Alone in the Dark;Horror;Schultz, Dwight;;Sholder, Jack;75;No;NicholasCage.png 1971;109;Beguiled, The;Drama;Eastwood, Clint;;Siegel, Don;60;No;clintEastwood.png 1979;112;Escape from Alcatraz;Drama;Eastwood, Clint;;Siegel, Don;22;No;clintEastwood.png 1948;88;Criss Cross;Drama;Lancaster, Burt;;Siodmak, Robert;77;No;burtLancaster.png 1976;132;Midway;War;Heston, Charlton;;Smight, Jack;36;No;NicholasCage.png 1990;126;Indiana Jones & the Last Crusade;Action;Ford, 
Harrison;;Spielberg, Steven;8;No;NicholasCage.png 1993;90;Duel;Mystery;Weaver, Dennis;;Spielberg, Steven;48;No;NicholasCage.png 1991;193;Separate but Equal;Drama;Poitier, Sidney;;Stevens, George, Jr.;56;No;NicholasCage.png 1924;123;Gosta Berling's Saga;Drama;Hanson, Lars;;Stiller, Mauritz;63;No;NicholasCage.png 1986;120;Platoon;Drama;Sheen, Charlie;;Stone, Oliver;8;Yes;NicholasCage.png 1963;89;Crawling Hand, The;Science Fiction;Breck, Peter;;Strock, Herbert L.;79;No;NicholasCage.png 1971;100;Willy Wonka & the Chocolate Factory;Music;Wilder, Gene;;Stuart, Mel;65;No;NicholasCage.png 1971;88;Joe Kidd;Western;Eastwood, Clint;;Sturges, John;79;No;clintEastwood.png 1985;104;Santa Claus, The Movie;Comedy;Moore, Dudley;;Szwarc, Jeannot;19;No;NicholasCage.png 1938;96;Boys Town;Drama;Tracy, Spencer;;Taurog, Norman;21;Yes;spencerTracy.png 1990;59;Erasure, Live Wild!;Music;;;Taylor, Gavin;48;No;NicholasCage.png 1982;150;A Question of Honor;Drama;Gazzara, Ben;;Taylor, Jud;80;No;NicholasCage.png 1947;61;Check Your Guns;Western;Dean, Eddie;;Taylor, Ray;80;No;NicholasCage.png 1947;56;West to Glory;Western;Dean, Eddie;;Taylor, Ray;43;No;NicholasCage.png 1937;60;Throwback, The;Western;Jones, Buck;;Taylor, Ray;53;No;NicholasCage.png 1992;54;Border Feud;Action;LaRue, Lash;;Taylor, Ray;43;No;NicholasCage.png 1947;58;Fighting Vigilantes, The;Western;LaRue, Lash;;Taylor, Ray;21;No;NicholasCage.png 1947;53;Law of the Lash;Western;LaRue, Lash;;Taylor, Ray;66;No;NicholasCage.png 1949;66;Outlaw Country;Western;LaRue, Lash;;Taylor, Ray;62;No;NicholasCage.png 1992;53;Return of the Lash;Action;LaRue, Lash;;Taylor, Ray;78;No;NicholasCage.png 1937;60;Mystery of the Hooded Horsemen;Western;Ritter, Tex;;Taylor, Ray;52;No;NicholasCage.png 1937;60;Tex Rides with the Boy Scouts;Western;Ritter, Tex;;Taylor, Ray;17;No;NicholasCage.png 1949;59;Shadows of the West;Western;Wilson, Whip;;Taylor, Ray;40;No;NicholasCage.png 1991;102;Instant Karma;Comedy;Cassidy, David;;Taylor, Roderick;47;No;NicholasCage.png 
1957;73;Time Lock;Drama;Connery, Sean;;Thomas, Gerald;5;No;seanConnery.png 1953;79;Appointment in Honduras;Drama;Ford, Glenn;;Tourneur, Jacques;7;No;glennFord.png 1982;136;Danton;Drama;Depardieu, Gérard;;Wajda, Andrzej;5;No;NicholasCage.png 1960;164;Alamo, The;Action;Wayne, John;;Wayne, John;29;No;johnWayne.png 1986;91;La Chevre, (The Goat);Drama;Depardieu, Gérard;;Veber, Francis;24;No;NicholasCage.png 1985;109;Les Comperes;Comedy;Richard, Pierre;;Veber, Francis;54;No;NicholasCage.png 1990;128;Dead Poets Society;Drama;Williams, Robin;;Weir, Peter;8;Yes;NicholasCage.png 1952;93;Othello, The Lost Masterpiece;Drama;Welles, Orson;;Welles, Orson;23;No;NicholasCage.png 1949;119;Battleground, The;War;Johnson, Van;;Wellman, William;7;No;NicholasCage.png 1976;176;Kings of the Road (In the Course of Time);Drama;Vogler, Rudiger;;Wenders, Wim;41;No;NicholasCage.png 1990;98;Hiroshima;Drama;Nelson, Judd;;Werner, Peter;17;No;NicholasCage.png 1982;111;Return of Martin Guerre, The;Drama;Depardieu, Gérard;;Vigne, Daniel;51;No;NicholasCage.png 1956;97;Somebody up There Likes Me;Drama;Newman, Paul;;Wise, Robert;56;No;paulNewman.png 1955;57;Jack Benny Show;Comedy;Benny, Jack;;;51;No;NicholasCage.png 1962;182;Mutiny on the Bounty;Action;Brando, Marlon;;;35;No;brando.png 1989;;Death Valley Days, Deadly Decision;Western;Caan, James;;;9;No;NicholasCage.png 1986;60;Monty Python's Flying Circus;Comedy;Chapman, Graham;;;4;No;NicholasCage.png 1986;60;Monty Python's Flying Circus, Vol 1.;Comedy;Chapman, Graham;;;24;No;NicholasCage.png 1986;59;Monty Python's Flying Circus, Vol 2.;Comedy;Chapman, Graham;;;79;No;NicholasCage.png 1986;58;Monty Python's Flying Circus, Vol 3.;Comedy;Chapman, Graham;;;63;No;NicholasCage.png 1990;;Valkenvania;Comedy;Chase, Chevy;;;82;No;NicholasCage.png 1982;101;Secret Policeman's Other Ball, The;Comedy;Cleese, John;;;86;No;NicholasCage.png 1981;127;Taming of the Shrew, The;Drama;Cleese, John;;;2;No;NicholasCage.png 1964;;From Russia with Love;Action;Connery, 
Sean;;;6;No;seanConnery.png 1993;108;Offence, The;Mystery;Connery, Sean;;;6;No;seanConnery.png 1992;60;Hollywood Mavericks;Comedy;Coppola, Francis Ford;;;22;No;NicholasCage.png 1990;60;Live at Harrah's;Comedy;Cosby, Bill;;;6;No;NicholasCage.png 1992;52;Persuaders, The Overture, The;Mystery;Curtis, Tony;;;40;No;NicholasCage.png 1977;255;Nineteen Hundred;Drama;De Niro, Robert;;;82;No;NicholasCage.png 1989;90;Van, The;Comedy;DeVito, Danny;;;5;No;NicholasCage.png 1972;15;My Country Right or Wrong;War;Douglas, Michael;;;21;No;NicholasCage.png 1991;;Clint Eastwood Collection, The;Westerns;Eastwood, Clint;;;11;No;clintEastwood.png 1991;;Complete Dirty Harry, Magnum Force, The;Action;Eastwood, Clint;;;53;No;clintEastwood.png 1992;92;Dead Pool, The;Action;Eastwood, Clint;;;26;No;clintEastwood.png 1992;163;Good, the Bad & the Ugly, The;Westerns;Eastwood, Clint;;;68;No;clintEastwood.png 1959;60;Rawhide, Premiere Episode;Western;Eastwood, Clint;;;54;No;clintEastwood.png 1992;118;Tightrope;Mystery;Eastwood, Clint;;;55;No;clintEastwood.png 1987;95;Hearts of Fire;Drama;Everett, Rupert;;;25;No;NicholasCage.png 1992;165;How the West Was Won;Western;Fonda, Henry;;;45;No;NicholasCage.png 1992;;Mummy's Hand, The;Mystery;Foran, Dick;;;54;No;NicholasCage.png 1993;88;Great White Death;Action;Ford, Glenn;;;26;No;glennFord.png 1986;119;Mosquito Coast, The;Drama;Ford, Harrison;;;54;No;NicholasCage.png 1993;102;Today We Kill....Tomorrow We Die;Western;Ford, Montgomery;;;25;No;NicholasCage.png 1991;;Tormenta Sobre Arizona;Drama;Ford, Wallace;;;81;No;NicholasCage.png 1989;116;Back to the Future II;Comedy;Fox, Michael J.;;;65;No;NicholasCage.png 1959;60;Maverick, Duel at Sundown;Western;Garner, James;;;26;No;NicholasCage.png 1983;;Shakespeare Series;Drama;Gielgud, John;;;23;No;NicholasCage.png 1973;105;Deadly Trackers;Western;Harris, Richard;;;54;No;NicholasCage.png 1992;72;American Film Institute, Alfred Hitchcock;Mystery;Hitchcock, Alfred;;;70;No;NicholasCage.png 1990;;A Married 
Man;Drama;Hopkins, Anthony;;;79;No;AnthonyHopkins.png 1982;208;Othello;Drama;Hopkins, Anthony;;;84;No;AnthonyHopkins.png 1975;85;Only Way Home, The;Drama;Hopkins, Bo;;;60;No;NicholasCage.png 1953;120;Tales of Tomorrow;Horror;Karloff, Boris;;;0;No;NicholasCage.png 1991;128;Inherit the Wind;Drama;Kelly, Gene;;;18;No;NicholasCage.png 1990;45;This Is Horror;Horror;King, Stephen;;;3;No;NicholasCage.png 1992;112;Conversation Piece;Drama;Lancaster, Burt;;;1;No;burtLancaster.png 1992;105;Crimson Pirate, The;Action;Lancaster, Burt;;;60;No;burtLancaster.png 1992;83;Devil's Disciple, The;Mystery;Lancaster, Burt;;;65;No;burtLancaster.png 1992;166;Hallelujah Trail, The;Drama;Lancaster, Burt;;;6;No;burtLancaster.png 1992;133;Train, The;Action;Lancaster, Burt;;;68;No;burtLancaster.png 1986;49;Jay Leno: The American Dream;Comedy;Leno, Jay;;;67;No;NicholasCage.png 1990;92;Primal Rage;Mystery;Lowe, Patrick;;;3;No;NicholasCage.png 1990;50;Industrial Symphony, The Dream of the Broken-Hearted;Music;Lynch, David;;;49;No;lynch.png 1986;52;Howie Mandel's North American Watusi Tour;Comedy;Mandel, Howie;;;65;No;NicholasCage.png 1989;90;Branford Marsalis, Steep;Music;Marsalis, Branford;;;52;No;NicholasCage.png 1991;98;L. A. 
Story;Comedy;Martin, Steve;;;81;No;NicholasCage.png 1986;60;Steve Martin Live!;Comedy;Martin, Steve;;;3;No;NicholasCage.png 1974;60;Steve Martin, The Funnier Side of Eastern Canada;Comedy;Martin, Steve;;;34;No;NicholasCage.png 1993;;Runaway Barge, The;Action;Matheson, Tim;;;38;No;NicholasCage.png 1992;101;Romulus & the Sabines;Action;Moore, Roger;;;76;No;NicholasCage.png 1989;;Saint, The;Mystery;Moore, Roger;;;29;No;NicholasCage.png 1983;91;Strange Brew;Comedy;Moranis, Rick;;;24;No;NicholasCage.png 1990;98;Another Forty-Eight Hours;Action;Murphy, Eddie;;;54;No;NicholasCage.png 1989;;Best of Eddie Murphy, Saturday Night Live, The;Comedy;Murphy, Eddie;;;56;No;NicholasCage.png 1991;99;What about Bob?;Comedy;Murray, Bill;;;6;No;NicholasCage.png 1953;91;Mummy's Revenge, The;Horror;Naschy, Paul;;;56;No;NicholasCage.png 1992;121;Harper;Mystery;Newman, Paul;;;86;No;paulNewman.png 1992;102;Left Handed Gun, The;Western;Newman, Paul;;;26;No;paulNewman.png 1989;;Once upon a Wheel;Action;Newman, Paul;;;40;No;paulNewman.png 1992;136;Prize, The;Drama;Newman, Paul;;;66;No;paulNewman.png 1968;;Secret War of Harry Frigg, The;Comedy;Newman, Paul;;;28;No;paulNewman.png 1990;;Two Jakes, The;Mystery;Nicholson, Jack;;;3;No;NicholasCage.png 1989;61;Exile in Concert;Music;Pennington, J. 
P.;;;12;No;NicholasCage.png 1987;60;Joe Piscopo New Jersey Special;Comedy;Piscopo, Joe;;;14;No;NicholasCage.png 1991;60;Joe Piscopo Video, The;Comedy;Piscopo, Joe;;;44;No;NicholasCage.png 1989;;Death Valley Days, No Gun Behind His Badge;Western;Reagan, Ronald;;;1;No;NicholasCage.png 1988;96;Salsa: The Motion Picture;Drama;Rosa, Robby;;;26;No;NicholasCage.png 1991;80;Hollywood's Greatest War Movies;War;Scott, George C.;;;41;No;NicholasCage.png 1991;91;Out for Justice;Action;Seagal, Steven;;;2;No;NicholasCage.png 1956;27;Case of the Mukkinese Battle Horn, The;Comedy;Sellers, Peter;;;45;No;NicholasCage.png 1953;75;Goon Show Movie, The;Comedy;Sellers, Peter;;;80;No;NicholasCage.png 1975;95;Great McGonagall, The;Comedy;Sellers, Peter;;;72;No;NicholasCage.png 1991;101;I'm All Right Jack;Comedy;Sellers, Peter;;;23;No;NicholasCage.png 1991;101;Magic Christian, The;Comedy;Sellers, Peter;;;75;No;NicholasCage.png 1960;91;Never Let Go;Action;Sellers, Peter;;;5;No;NicholasCage.png 1991;121;Pink Panther, The;Comedy;Sellers, Peter;;;77;No;NicholasCage.png 1991;84;Two-Way Stretch;Comedy;Sellers, Peter;;;7;No;NicholasCage.png 1988;65;Face at the Window, The;Horror;Slaughter, Tod;;;79;No;NicholasCage.png 1958;92;Tom Thumb;Science Fiction;Tamblyn, Russ;;;30;No;NicholasCage.png 1989;90;Beartooth;Action;Taylor, Dub;;;70;No;NicholasCage.png 1979;90;James Taylor in Concert;Music;Taylor, James;;;38;No;NicholasCage.png 1942;253;Gangbusters;Drama;Taylor, Kent;;;31;No;NicholasCage.png 1992;;El Rublo de las Dos Caras;Action;Taylor, Robert;;;83;No;NicholasCage.png 1992;87;Law & Jake Wade, The;Drama;Taylor, Robert;;;68;No;NicholasCage.png 1967;105;Chuka;Western;Taylor, Rod;;;47;No;NicholasCage.png 1980;93;Cry of the Innocent;Drama;Taylor, Rod;;;13;No;NicholasCage.png 1991;108;Edison the Man;Drama;Tracy, Spencer;;;19;No;spencerTracy.png 1991;101;Keeper of the Flame;Drama;Tracy, Spencer;;;76;No;spencerTracy.png 1991;92;Spencer Tracy Legacy, The;Comedy;Tracy, Spencer;;;44;No;spencerTracy.png 
1957;60;Cheyenne, The Iron Trail;Western;Walker, Clint;;;1;No;NicholasCage.png 1992;56;Dawn Rider, The;Western;Wayne, John;;;44;No;johnWayne.png 1993;;Duke, The Films of John Wayne;Western;Wayne, John;;;70;No;johnWayne.png 1939;55;Frontier Horizon;Western;Wayne, John;;;73;No;johnWayne.png 1934;54;Hell Town;Western;Wayne, John;;;23;No;johnWayne.png 1932;;Hurricane Express;Western;Wayne, John;;;7;No;johnWayne.png 1932;210;Hurricane Express, The;Action;Wayne, John;;;68;No;johnWayne.png 1965;165;In Harm's Way;War;Wayne, John;;;66;No;johnWayne.png 1991;;John Wayne Collection, Red River, The;War;Wayne, John;;;49;No;johnWayne.png 1992;;John Wayne Collector's Limited Edition;War;Wayne, John;;;3;No;johnWayne.png 1991;;John Wayne Four Pack;Western;Wayne, John;;;58;No;johnWayne.png 1939;112;John Wayne Matinee Double Feature, No. 2;Western;Wayne, John;;;3;No;johnWayne.png 1939;110;John Wayne Matinee Double Feature, No. 3;Western;Wayne, John;;;24;No;johnWayne.png 1938;110;John Wayne Matinee Double Feature, No. 
4;Western;Wayne, John;;;28;No;johnWayne.png 1990;;John Wayne Six Pack;Western;Wayne, John;;;87;No;johnWayne.png 1991;;John Wayne Western Greats, Rio Bravo;Western;Wayne, John;;;22;No;johnWayne.png 1991;56;King of the Pecos;Western;Wayne, John;;;78;No;johnWayne.png 1992;59;Lawless Frontier;Western;Wayne, John;;;8;No;johnWayne.png 1991;52;Lawless Frontier, The;Western;Wayne, John;;;35;No;johnWayne.png 1991;56;Lawless Nineties, The;Western;Wayne, John;;;3;No;johnWayne.png 1934;54;Lucky Texan;Western;Wayne, John;;;48;No;johnWayne.png 1992;112;McQ;Action;Wayne, John;;;5;No;johnWayne.png 1993;;Neath Arizona Skies;Western;Wayne, John;;;73;No;johnWayne.png 1991;54;Neath the Arizona Skies;Western;Wayne, John;;;28;No;johnWayne.png 1991;53;Randy Rides Alone;Western;Wayne, John;;;75;No;johnWayne.png 1993;58;Range Feud;Western;Wayne, John;;;77;No;johnWayne.png 1992;134;Red River;Western;Wayne, John;;;16;No;johnWayne.png 1991;52;Riders of Destiny;Western;Wayne, John;;;30;No;johnWayne.png 1990;;Sagebrush Trail;Western;Wayne, John;;;23;No;johnWayne.png 1932;226;Shadow of the Eagle, The;Action;Wayne, John;;;19;No;johnWayne.png 1989;103;Blood & Guns;Action;Welles, Orson;;;43;No;NicholasCage.png 1988;78;Hot Money;Drama;Welles, Orson;;;19;No;NicholasCage.png 1977;75;Comedy Tonight;Comedy;Williams, Robin;;;18;No;NicholasCage.png 1991;65;Robin Williams;Comedy;Williams, Robin;;;4;No;NicholasCage.png ================================================ FILE: FUNDING.yml ================================================ custom: https://learndataengineering.com/p/academy ================================================ FILE: LICENSE ================================================ Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. 
"Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. 
"Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. 
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the 
Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. 
Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. 
To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ================================================ FILE: README.md ================================================
Data Engineering Cookbook


What is this Book?    How to Contribute    YouTube    Twitter    Amazon Shop


## If You Like This Book & Need More Help Check out my Data Engineering Academy at LearnDataEngineering.com trusted by almost 2,000 students! **Visit learndataengineering.com:** [Click Here](https://learndataengineering.com) - Learn Data Engineering with our online Academy - Perfect for becoming a Data Engineer or adding Data Engineering to your skillset - Proven process based on years of experience and hundreds of hours of personal coaching - Over 30 prepared courses on the most important techniques, fundamental tools and platforms plus our Associate Data Engineer Certification - Academy Discord server with over 1,000 members ## Support This Book For Free! - **Amazon:** [Click Here](https://www.amazon.com/shop/plumbersofdatascience) buy whatever you like from Amazon using this link* (Also check out my complete podcast gear and books) ## Here's what's new: Find the change log with all recent updates here: [SEE UPDATES](sections/10-Updates.md) # Contents: - [Introduction](sections/01-Introduction.md) - [Basic Engineering Skills](sections/02-BasicSkills.md) - [Advanced Engineering Skills](sections/03-AdvancedSkills.md) - [Free Hands On Courses / Tutorials](sections/04-HandsOnCourse.md) - [Case Studies](sections/05-CaseStudies.md) - [Best Practices Cloud Platforms](sections/06-BestPracticesCloud.md) - [130+ Data Sources Data Science](sections/07-DataSources.md) - [1001 Interview Questions](sections/08-InterviewQuestions.md) - [Recommended Books, Courses, and Podcasts](sections/09-BooksAndCourses.md) - [Updates](sections/10-Updates.md) - [How To Contribute](#how-to-contribute) - [Support What You Like](#support-this-book-for-free) - [Important Links](#important-links) # Full Table Of Contents: ## Introduction - [What is this Cookbook](sections/01-Introduction.md#what-is-this-cookbook) - [Data Engineers](sections/01-Introduction.md#data-engineers) - [My Data Science Platform Blueprint](sections/01-Introduction.md#my-data-science-platform-blueprint) -
[Connect](sections/01-Introduction.md#connect) - [Buffer](sections/01-Introduction.md#buffer) - [Processing Framework](sections/01-Introduction.md#processing-framework) - [Store](sections/01-Introduction.md#store) - [Visualize](sections/01-Introduction.md#visualize) - [Who Companies Need](sections/01-Introduction.md#who-companies-need) - [How to Learn Data Engineering](sections/01-Introduction.md#how-to-learn-data-engineering) - [Andreas on the Super Data Science Podcast](sections/01-Introduction.md#Interview-with-Andreas-on-the-Super-Data-Science-Podcast) - [Building Blocks to Learn Data Engineering](sections/01-Introduction.md#building-blocks-to-learn-data-engineering) - [Roadmap for Beginners](sections/01-Introduction.md#roadmap-for-beginners) - [Roadmap for Data Analysts](sections/01-Introduction.md#roadmap-for-data-analysts) - [Roadmap for Data Scientists](sections/01-Introduction.md#roadmap-for-data-scientists) - [Roadmap for Software Engineers](sections/01-Introduction.md#roadmap-for-software-engineers) - [Data Engineers Skills Matrix](sections/01-Introduction.md#data-engineers-skills-matrix) - [How to Become a Senior Data Engineer](sections/01-Introduction.md#how-to-become-a-senior-data-engineer) ## Basic Engineering Skills - [Learn To Code](sections/02-BasicSkills.md#learn-to-code) - [Get Familiar With Git](sections/02-BasicSkills.md#get-familiar-with-git) - [Agile Development](sections/02-BasicSkills.md#agile-development) - [Why is agile so important?](sections/02-BasicSkills.md#Why-is-agile-so-important) - [Agile rules I learned over the years](sections/02-BasicSkills.md#agile-rules-i-learned-over-the-years) - [Agile Frameworks](sections/02-BasicSkills.md#agile-frameworks) - [Scrum](sections/02-BasicSkills.md#scrum) - [OKR](sections/02-BasicSkills.md#okr) - [Software Engineering Culture](sections/02-BasicSkills.md#software-engineering-culture) - [Learn how a Computer Works](sections/02-BasicSkills.md#learn-how-a-computer-works) - [Data Network 
Transmission](sections/02-BasicSkills.md#data-network-transmission) - [Security and Privacy](sections/02-BasicSkills.md#security-and-privacy) - [SSL Public and Private Key Certificates](sections/02-BasicSkills.md#ssl-public-and-private-key-Certificates) - [JSON Web Tokens](sections/02-BasicSkills.md#json-web-tokens) - [GDPR regulations](sections/02-BasicSkills.md#gdpr-regulations) - [Linux](sections/02-BasicSkills.md#linux) - [OS Basics](sections/02-BasicSkills.md#os-basics) - [Shell scripting](sections/02-BasicSkills.md#shell-scripting) - [Cron Jobs](sections/02-BasicSkills.md#cron-jobs) - [Packet Management](sections/02-BasicSkills.md#packet-management) - [Docker](sections/02-BasicSkills.md#docker) - [What is Docker and How it Works](sections/02-BasicSkills.md#what-is-docker-and-what-do-you-use-it-for) - [Don't Mess Up Your System](sections/02-BasicSkills.md#dont-mess-up-your-system) - [Preconfigured Images](sections/02-BasicSkills.md#preconfigured-images) - [Take it With You](sections/02-BasicSkills.md#take-it-with-you) - [Kubernetes Container Deployment](sections/02-BasicSkills.md#kubernetes-container-deployment) - [How to Create Start and Stop a Container](sections/02-BasicSkills.md#how-to-create-start-stop-a-container) - [Docker Micro Services](sections/02-BasicSkills.md#docker-micro-services) - [Kubernetes](sections/02-BasicSkills.md#kubernetes) - [Why and How To Do Docker Container Orchestration](sections/02-BasicSkills.md#why-and-how-to-do-docker-container-orchestration) - [Useful Docker Commands](sections/02-BasicSkills.md#useful-docker-commands) - [The Cloud](sections/02-BasicSkills.md#the-cloud) - [IaaS vs PaaS vs SaaS](sections/02-BasicSkills.md#iaas-vs-paas-vs-saas) - [AWS Azure IBM Google](sections/02-BasicSkills.md#aws-azure-ibm-google) - [Cloud vs On-Premises](sections/02-BasicSkills.md#cloud-vs-on-premises) - [Security](sections/02-BasicSkills.md#security) - [Hybrid Clouds](sections/02-BasicSkills.md#hybrid-clouds) - [Security Zone
Design](sections/02-BasicSkills.md#security-zone-design) - [How to secure a multi layered application](sections/02-BasicSkills.md#how-to-secure-a-multi-layered-application) - [Cluster security with Kerberos](sections/02-BasicSkills.md#cluster-security-with-kerberos) ## Advanced Engineering Skills - [Data Science Platform](sections/03-AdvancedSkills.md#data-science-platform) - [Why a Good Data Platform Is Important](sections/03-AdvancedSkills.md#why-a-good-data-platform-is-important) - [Big Data vs Data Science and Analytics](sections/03-AdvancedSkills.md#Big-Data-vs-Data-Science-and-Analytics) - [The 4 Vs of Big Data](sections/03-AdvancedSkills.md#the-4-vs-of-big-data) - [Why Big Data](sections/03-AdvancedSkills.md#why-big-data) - [Planning is Everything](sections/03-AdvancedSkills.md#planning-is-everything) - [The Problem with ETL](sections/03-AdvancedSkills.md#the-problem-with-etl) - [Scaling Up](sections/03-AdvancedSkills.md#scaling-up) - [Scaling Out](sections/03-AdvancedSkills.md#scaling-out) - [When not to Do Big Data](sections/03-AdvancedSkills.md#please-dont-go-big-data) - [81 Platform & Pipeline Design Questions](sections/03-AdvancedSkills.md#81-platform-and-pipeline-design-questions) - [Data Source Questions](sections/03-AdvancedSkills.md#data-source-questions) - [Goals and Destination Questions](sections/03-AdvancedSkills.md#goals-and-destination-questions) - [Connect](sections/03-AdvancedSkills.md#connect) - [REST APIs](sections/03-AdvancedSkills.md#rest-apis) - [API Design](sections/03-AdvancedSkills.md#api-design) - [Implemenation Frameworks](sections/03-AdvancedSkills.md#implementation-frameworks) - [Security](sections/03-AdvancedSkills.md#security) - [Apache Nifi](sections/03-AdvancedSkills.md#apache-nifi) - [Logstash](sections/03-AdvancedSkills.md#logstash) - [Buffer](sections/03-AdvancedSkills.md#buffer) - [Apache Kafka](sections/03-AdvancedSkills.md#apache-kafka) - [Why a Message Queue 
Tool?](sections/03-AdvancedSkills.md#why-a-message-queue-tool) - [Kafka Architecture](sections/03-AdvancedSkills.md#kafka-architecture) - [Kafka Topics](sections/03-AdvancedSkills.md#what-are-topics) - [Kafka and Zookeeper](sections/03-AdvancedSkills.md#what-does-zookeeper-have-to-do-with-kafka) - [How to Produce and Consume Messages](sections/03-AdvancedSkills.md#how-to-produce-and-consume-messages) - [Kafka Commands](sections/03-AdvancedSkills.md#kafka-commands) - [Apache Redis Pub-Sub](sections/03-AdvancedSkills.md#redis-pub-sub) - [AWS Kinesis](sections/03-AdvancedSkills.md#apache-kafka) - [Google Cloud PubSub](sections/03-AdvancedSkills.md#google-cloud-pubsub) - [Processing Frameworks](sections/03-AdvancedSkills.md#processing-frameworks) - [Lambda and Kappa Architecture](sections/03-AdvancedSkills.md#lambda-and-kappa-architecture) - [Batch Processing](sections/03-AdvancedSkills.md#batch-processing) - [Stream Processing](sections/03-AdvancedSkills.md#stream-processing) - [Three Methods of Streaming](sections/03-AdvancedSkills.md#three-methods-of-streaming) - [At Least Once](sections/03-AdvancedSkills.md#at-least-once) - [At Most Once](sections/03-AdvancedSkills.md#at-most-once) - [Exactly Once](sections/03-AdvancedSkills.md#exactly-once) - [Check The Tools](sections/03-AdvancedSkills.md#check-the-tools) - [Should You do Stream or Batch Processing](sections/03-AdvancedSkills.md#should-you-do-stream-or-batch-processing) - [Is ETL still relevant for Analytics?](sections/03-AdvancedSkills.md#is-etl-still-relevant-for-analytics) - [MapReduce](sections/03-AdvancedSkills.md#mapreduce) - [How Does MapReduce Work](sections/03-AdvancedSkills.md#How-does-mapreduce-work) - [MapReduce](sections/03-AdvancedSkills.md#mapreduce) - [MapReduce Example](sections/03-AdvancedSkills.md#example) - [MapReduce Limitations](sections/03-AdvancedSkills.md#What-is-the-limitation-of-mapreduce) - [Apache Spark](sections/03-AdvancedSkills.md#apache-spark) - [What is the Difference to 
MapReduce?](sections/03-AdvancedSkills.md#what-is-the-difference-to-MapReduce) - [How Spark Fits to Hadoop](sections/03-AdvancedSkills.md#how-does-spark-fit-to-hadoop) - [Spark vs Hadoop](sections/03-AdvancedSkills.md#wheres-the-difference) - [Spark and Hadoop a Perfect Fit](sections/03-AdvancedSkills.md#spark-and-hadoop-is-a-perfect-fit) - [Spark on YARn](sections/03-AdvancedSkills.md#spark-on-yarn) - [My Simple Rule of Thumb](sections/03-AdvancedSkills.md#my-simple-rule-of-thumb) - [Available Languages](sections/03-AdvancedSkills.md#available-languages) - [Spark Driver Executor and SparkContext](sections/03-AdvancedSkills.md#how-spark-works-driver-executor-sparkcontext) - [Spark Batch vs Stream processing](sections/03-AdvancedSkills.md#spark-batch-vs-stream-processing) - [How Spark uses Data From Hadoop](sections/03-AdvancedSkills.md#How-does-spark-use-data-from-hadoop) - [What are RDDs and How to Use Them](sections/03-AdvancedSkills.md#what-are-rdds-and-how-to-use-them) - [SparkSQL How and Why to Use It](sections/03-AdvancedSkills.md#available-languages) - [What are Dataframes and How to Use Them](sections/03-AdvancedSkills.md#what-are-dataframes-how-to-use-them) - [Machine Learning on Spark (TensorFlow)](sections/03-AdvancedSkills.md#machine-learning-on-spark-tensor-flow) - [MLlib](sections/03-AdvancedSkills.md#mllib) - [Spark Setup](sections/03-AdvancedSkills.md#spark-setup) - [Spark Resource Management](sections/03-AdvancedSkills.md#spark-resource-management) - [AWS Lambda](sections/03-AdvancedSkills.md#apache-flink) - [Apache Flink](sections/03-AdvancedSkills.md#apache-flink) - [Elasticsearch](sections/03-AdvancedSkills.md#elasticsearch) - [Apache Drill](sections/03-AdvancedSkills.md#apache-drill) - [StreamSets](sections/03-AdvancedSkills.md#streamsets) - [Store](sections/03-AdvancedSkills.md#store) - [Analytical Data Stores](03-AdvancedSkills.md#analytical-data-stores) - [Data Warehouse vs Data 
Lake](sections/03-AdvancedSkills.md#data-warehouse-vs-data-lake) - [Snowflake and dbt](sections/03-AdvancedSkills.md#snowflake-and-dbt) - [Transactional Data Stores](sections/03-AdvancedSkills.md#transactional-data-stores) - [SQL Databases](sections/03-AdvancedSkills.md#sql-databases) - [PostgreSQL DB](sections/03-AdvancedSkills.md#postgresql-db) - [Database Design](sections/03-AdvancedSkills.md#database-design) - [SQL Queries](sections/03-AdvancedSkills.md#sql-queries) - [Stored Procedures](sections/03-AdvancedSkills.md#stored-procedures) - [ODBC/JDBC Server Connections](sections/03-AdvancedSkills.md#odbc-jdbc-server-connections) - [NoSQL Stores](sections/03-AdvancedSkills.md#nosql-stores) - [HBase KeyValue Store](sections/03-AdvancedSkills.md#keyvalue-stores-hbase) - [HDFS Document Store](sections/03-AdvancedSkills.md#document-stores-hdfs) - [MongoDB Document Store](sections/03-AdvancedSkills.md#document-stores-mongodb) - [Elasticsearch Document Store](sections/03-AdvancedSkills.md#Elasticsearch-search-engine-and-document-store) - [Hive Warehouse](sections/03-AdvancedSkills.md#hive-warehouse) - [Impala](sections/03-AdvancedSkills.md#impala) - [Kudu](sections/03-AdvancedSkills.md#kudu) - [Apache Druid](sections/03-AdvancedSkills.md#apache-druid) - [InfluxDB Time Series Database](sections/03-AdvancedSkills.md#influxdb-time-series-database) - [Greenplum MPP Database](sections/03-AdvancedSkills.md#mpp-databases-greenplum) - [Visualize](sections/03-AdvancedSkills.md#visualize) - [Android and IOS](sections/03-AdvancedSkills.md#android-and-ios) - [API Design for Mobile Apps](sections/03-AdvancedSkills.md#how-to-design-apis-for-mobile-apps) - [Dashboards](sections/03-AdvancedSkills.md#dashboards) - [Grafana](sections/03-AdvancedSkills.md#grafana) - [Kibana](sections/03-AdvancedSkills.md#kibana) - [Webservers](sections/03-AdvancedSkills.md#how-to-use-webservers-to-display-content) - [Tomcat](sections/03-AdvancedSkills.md#tomcat) - 
[Jetty](sections/03-AdvancedSkills.md#jetty) - [NodeRED](sections/03-AdvancedSkills.md#nodered) - [React](sections/03-AdvancedSkills.md#react) - [Business Intelligence Tools](sections/03-AdvancedSkills.md#business-intelligence-tools) - [Tableau](sections/03-AdvancedSkills.md#tableau) - [Power BI](sections/03-AdvancedSkills.md#power-bi) - [Quliksense](sections/03-AdvancedSkills.md#quliksense) - [Identity & Device Management](sections/03-AdvancedSkills.md#Identity-and-device-management) - [What Is A Digital Twin](sections/03-AdvancedSkills.md#what-is-a-digital-twin) - [Active Directory](sections/03-AdvancedSkills.md#active-directory) - [Machine Learning](sections/03-AdvancedSkills.md#machine-learning) - [How to do Machine Learning in production](sections/03-AdvancedSkills.md#how-to-domachine-learning-in-production) - [Why machine learning in production is harder then you think](sections/03-AdvancedSkills.md#why-machine-learning-in-production-is-harder-then-you-think) - [Models Do Not Work Forever](sections/03-AdvancedSkills.md#models-do-not-work-forever) - [Where are The Platforms That Support Machine Learning](sections/03-AdvancedSkills.md#where-are-the-platforms-that-support-this) - [Training Parameter Management](sections/03-AdvancedSkills.md#training-parameter-management) - [How to Convince People That Machine Learning Works](sections/03-AdvancedSkills.md#how-to-convince-people-machine-learning-works) - [No Rules No Physical Models](sections/03-AdvancedSkills.md#no-rules-no-physical-models) - [You Have The Data. 
Use It!](sections/03-AdvancedSkills.md#you-have-the-data-use-it) - [Data is Stronger Than Opinions](sections/03-AdvancedSkills.md#data-is-stronger-than-opinions) - [AWS Sagemaker](sections/03-AdvancedSkills.md#aws-sagemaker) ## Hands On Course - [Free Data Engineering Course with AWS, TDengine, Docker and Grafana](sections/04-HandsOnCourse.md#free-data-engineering-course-with-aws-tdengine-docker-and-grafana) - [Monitor your data in dbt & detect quality issues with Elementary](sections/04-HandsOnCourse.md#monitor-your-data-in-dbt-and-detect-quality-issues-with-elementary) - [Solving Engineers 4 Biggest Airflow Problems](sections/04-HandsOnCourse.md#solving-engineers-4-biggest-airflow-problems) - [The best alternative to Airlfow? Mage.ai](sections/04-HandsOnCourse.md#the-best-alternative-to-airlfow?-mage.ai) ## Case Studies - [Data Science @Airbnb](sections/05-CaseStudies.md#data-science-at-Airbnb) - [Data Science @Amazon](sections/05-CaseStudies.md#data-science-at-Amazon) - [Data Science @Baidu](sections/05-CaseStudies.md#data-science-at-Baidu) - [Data Science @Blackrock](sections/05-CaseStudies.md#data-science-at-Blackrock) - [Data Science @BMW](sections/05-CaseStudies.md#data-science-at-BMW) - [Data Science @Booking.com](sections/05-CaseStudies.md#data-science-at-Booking.com) - [Data Science @CERN](sections/05-CaseStudies.md#data-science-at-CERN) - [Data Science @Disney](sections/05-CaseStudies.md#data-science-at-Disney) - [Data Science @DLR](sections/05-CaseStudies.md#data-science-at-DLR) - [Data Science @Drivetribe](sections/05-CaseStudies.md#data-science-at-Drivetribe) - [Data Science @Dropbox](sections/05-CaseStudies.md#data-science-at-Dropbox) - [Data Science @Ebay](sections/05-CaseStudies.md#data-science-at-Ebay) - [Data Science @Expedia](sections/05-CaseStudies.md#data-science-at-Expedia) - [Data Science @Facebook](sections/05-CaseStudies.md#data-science-at-Facebook) - [Data Science @Google](sections/05-CaseStudies.md#data-science-at-Google) - [Data Science 
@Grammarly](sections/05-CaseStudies.md#data-science-at-Grammarly) - [Data Science @ING Fraud](sections/05-CaseStudies.md#data-science-at-ING-Fraud) - [Data Science @Instagram](sections/05-CaseStudies.md#data-science-at-Instagram) - [Data Science @LinkedIn](sections/05-CaseStudies.md#data-science-at-LinkedIn) - [Data Science @Lyft](sections/05-CaseStudies.md#data-science-at-Lyft) - [Data Science @NASA](sections/05-CaseStudies.md#data-science-at-NASA) - [Data Science @Netflix](sections/05-CaseStudies.md#data-science-at-Netflix) - [Data Science @OLX](sections/05-CaseStudies.md#data-science-at-OLX) - [Data Science @OTTO](sections/05-CaseStudies.md#data-science-at-OTTO) - [Data Science @Paypal](sections/05-CaseStudies.md#data-science-at-Paypal) - [Data Science @Pinterest](sections/05-CaseStudies.md#data-science-at-Pinterest) - [Data Science @Salesforce](sections/05-CaseStudies.md#data-science-at-Salesforce) - [Data Science @Siemens Mindsphere](sections/05-CaseStudies.md#data-science-at-Siemens-Mindsphere) - [Data Science @Slack](sections/05-CaseStudies.md#data-science-at-Slack) - [Data Science @Spotify](sections/05-CaseStudies.md#data-science-at-Spotify) - [Data Science @Symantec](sections/05-CaseStudies.md#data-science-at-Symantec) - [Data Science @Tinder](sections/05-CaseStudies.md#data-science-at-Tinder) - [Data Science @Twitter](sections/05-CaseStudies.md#data-science-at-Twitter) - [Data Science @Uber](sections/05-CaseStudies.md#data-science-at-Uber) - [Data Science @Upwork](sections/05-CaseStudies.md#data-science-at-Upwork) - [Data Science @Woot](sections/05-CaseStudies.md#data-science-at-Woot) - [Data Science @Zalando](sections/05-CaseStudies.md#data-science-at-Zalando) ## Best Practices Cloud Platforms - [Amazon Web Services (AWS)](sections/06-BestPracticesCloud.md#aws) - [Connect](sections/06-BestPracticesCloud.md#Connect) - [Buffer](sections/06-BestPracticesCloud.md#Buffer) - [Processing](sections/06-BestPracticesCloud.md#Processing) - 
[Store](sections/06-BestPracticesCloud.md#Store) - [Visualize](sections/06-BestPracticesCloud.md#Visualize) - [Containerization](sections/06-BestPracticesCloud.md#Containerization) - [Best Practices](sections/06-BestPracticesCloud.md#Best-Practices) - [More Details](sections/06-BestPracticesCloud.md#More-Details) - [Microsoft Azure](sections/06-BestPracticesCloud.md#azure) - [Connect](sections/06-BestPracticesCloud.md#Connect-1) - [Buffer](sections/06-BestPracticesCloud.md#Buffer-1) - [Processing](sections/06-BestPracticesCloud.md#Processing-1) - [Store](sections/06-BestPracticesCloud.md#Store-1) - [Visualize](sections/06-BestPracticesCloud.md#Visualize-1) - [Containerization](sections/06-BestPracticesCloud.md#Containerization-1) - [Best Practices](sections/06-BestPracticesCloud.md#Best-Practices-1) - [Google Cloud Platform (GCP)](sections/06-BestPracticesCloud.md#gcp) - [Connect](sections/06-BestPracticesCloud.md#Connect-2) - [Buffer](sections/06-BestPracticesCloud.md#Buffer-2) - [Processing](sections/06-BestPracticesCloud.md#Processing-2) - [Store](sections/06-BestPracticesCloud.md#Store-2) - [Visualize](sections/06-BestPracticesCloud.md#Visualize-2) - [Containerization](sections/06-BestPracticesCloud.md#Containerization-2) - [Best Practices](sections/06-BestPracticesCloud.md#Best-Practices-2) ## 130+ Free Data Sources For Data Science - [Student Favorites](sections/07-DataSources.md#Student-Favorites) - [General And Academic](sections/07-DataSources.md#General-And-Academic) - [Content Marketing](sections/07-DataSources.md#Content-Marketing) - [Crime](sections/07-DataSources.md#Crime) - [Drugs](sections/07-DataSources.md#Drugs) - [Education](sections/07-DataSources.md#Education) - [Entertainment](sections/07-DataSources.md#Entertainment) - [Environmental And Weather Data](sections/07-DataSources.md#Environmental-And-Weather-Data) - [Financial And Economic Data](sections/07-DataSources.md#Financial-And-Economic-Data) - [Government And
World](sections/07-DataSources.md#Government-And-World) - [Health](sections/07-DataSources.md#Health) - [Human Rights](sections/07-DataSources.md#Human-Rights) - [Labor And Employment Data](sections/07-DataSources.md#Labor-And-Employment-Data) - [Politics](sections/07-DataSources.md#Politics) - [Retail](sections/07-DataSources.md#Retail) - [Social](sections/07-DataSources.md#Social) - [Travel And Transportation](sections/07-DataSources.md#Travel-And-Transportation) - [Various Portals](sections/07-DataSources.md#Various-Portals) - [Source Articles and Blog Posts](sections/07-DataSources.md#Source-Articles-and-Blog-Posts) - [Free Data Sources Data Science](sections/07-DataSources.md) ## 1001 Interview Questions - [Interview Questions](sections/08-InterviewQuestions.md) ## Recommended Books, Courses, and Podcasts - [About Books and Courses](sections/09-BooksAndCourses.md#about-books-and-courses) - [Books](sections/09-BooksAndCourses.md#books) - [Languages](sections/09-BooksAndCourses.md#books-languages) - [Data Tools & Platforms](sections/09-BooksAndCourses.md#books-data-science-tools) - [Business](sections/09-BooksAndCourses.md#Books-Business) - [Community Recommendations](sections/09-BooksAndCourses.md#Community-Recommendations) - [Online Courses](sections/09-BooksAndCourses.md#online-courses) - [Preparation courses](sections/09-BooksAndCourses.md#Preparation-courses) - [Data engineering courses](sections/09-BooksAndCourses.md#Data-engineering-courses) - [Certifications](sections/09-BooksAndCourses.md#Certifications) - [Podcasts](sections/09-BooksAndCourses.md#Podcasts) - [Super Data Science](sections/09-BooksAndCourses.md#Super-Data-Science) - [Data Skeptic](sections/09-BooksAndCourses.md#Data-Skeptic) - [Data Engineering Podcast](sections/09-BooksAndCourses.md#Data-Engineering-Podcast) - [Roaring Elephant BiteSized Big Tech](sections/09-BooksAndCourses.md#Roaring-Elephant-BiteSized-Big-Tech) - [SQL Data Partners 
Podcast](sections/09-BooksAndCourses.md#SQL-Data-Partners-Podcast) ## How To Contribute If you have some cool links or topics for the cookbook, please become a contributor. Simply pull the repo, add your ideas and create a pull request. You can also open an issue and put your thoughts there. Please use the "Issues" function for comments. ## Important Links Subscribe to my YouTube channel for regular updates: [Link to YouTube](https://www.youtube.com/channel/UCY8mzqqGwl5_bTpBY9qLMAA) I have a Medium publication where you can publish your data engineer articles to reach more people: [Medium publication](https://link.medium.com/9oi1VDrhPW)
*(As an Amazon Associate I earn from qualifying purchases from Amazon. This is free of charge for you, but super helpful for supporting this channel.) ================================================ FILE: sections/01-Introduction.md ================================================ Introduction ============ ## Contents - [What is this Cookbook](01-Introduction.md#what-is-this-cookbook) - [Data Engineers](01-Introduction.md#data-engineers) - [My Data Science Platform Blueprint](01-Introduction.md#my-data-science-platform-blueprint) - [Connect](01-Introduction.md#connect) - [Buffer](01-Introduction.md#buffer) - [Processing Framework](01-Introduction.md#processing-framework) - [Store](01-Introduction.md#store) - [Visualize](01-Introduction.md#visualize) - [Who Companies Need](01-Introduction.md#who-companies-need) - [How to Learn Data Engineering](01-Introduction.md#how-to-learn-data-engineering) - [Andreas interview on the Super Data Science Podcast](01-Introduction.md#Interview-with-Andreas-on-the-Super-Data-Science-Podcast) - [Building Blocks to Learn Data Engineering](01-Introduction.md#building-blocks-to-learn-data-engineering) - [Roadmap for Beginners](01-Introduction.md#roadmap-for-beginners) - [Roadmap for Data Analysts](01-Introduction.md#roadmap-for-data-analysts) - [Roadmap for Data Scientists](01-Introduction.md#roadmap-for-data-scientists) - [Roadmap for Software Engineers](01-Introduction.md#roadmap-for-software-engineers) - [Data Engineers Skills Matrix](01-Introduction.md#data-engineers-skills-matrix) - [How to Become a Senior Data Engineer](01-Introduction.md#how-to-become-a-senior-data-engineer) ## What is this Cookbook I get asked a lot: "What do you actually need to learn to become an awesome data engineer?" Well, look no further. You'll find it here! If you are looking for AI algorithms and such data scientist things, this book is not for you. **How to use this Cookbook:** This book is intended to be a starting point for you.
It is not a training! I want to help you to identify the topics to look into to become an awesome data engineer in the process. It hinges on my Data Science Platform Blueprint. Check it out below. Once you understand it, you can find in the book tools that fit into each key area of a Data Science platform (Connect, Buffer, Processing Framework, Store, Visualize). Select a few tools you are interested in, then research and work with them. Don't learn everything in this book! Focus. **What types of content are in this book?** You are going to find five types of content in this book: Articles I wrote, links to my podcast episodes (video & audio), more than 200 links to helpful websites I like, data engineering interview questions and case studies. **This book is a work in progress!** As you can see, this book is not finished. I'm constantly adding new stuff and doing videos for the topics. But, obviously, because I do this as a hobby, my time is limited. You can help make this book even better. **Help make this book awesome!** If you have some cool links or topics for the cookbook, please become a contributor on GitHub: <https://github.com/andkret/Cookbook>. Fork the repo, add them, and create a pull request. Or join the discussion by opening Issues. Tell me your thoughts, what you value, what you think should be included, or correct me where I am wrong. You can also write me an email anytime at plumbersofdatascience\@gmail.com. **This Cookbook is and will always be free!** ## If You Like This Book & Need More Help: Check out my Data Engineering Academy at LearnDataEngineering.com **Visit learndataengineering.com:** [Click Here](https://learndataengineering.com) - Huge step-by-step Data Engineering Academy with over 30 courses - Unlimited access incl.
future courses during subscription - Access to all courses and example projects in the Academy - Associate Data Engineer Certification - Data Engineering on AWS E-Commerce example project - Microsoft Azure example project - Document Streaming example project with Docker, FastAPI, Apache Kafka, Apache Spark, MongoDB and Streamlit - Time Series example project with InfluxDB and Grafana - Lifetime access to the private Discord Workspace - Course certificates - Currently over 54 hours of videos ## Support This Book For Free! - **Amazon:** [Click Here](https://www.amazon.com/shop/plumbersofdatascience) buy whatever you like from Amazon using this link* (Also check out my complete podcast gear and books) ## How To Contribute If you have some cool links or topics for the cookbook, please become a contributor. Simply pull the repo, add your ideas and create a pull request. You can also open an issue and put your thoughts there. Please use the "Issues" function for comments. Data Engineers ------------------------------- Data Engineers are the link between the management's data strategy and the data scientists or analysts that need to work with data. What they do is build the platforms that enable data scientists to do their magic. These platforms are usually used in five different ways: - Data ingestion and storage of large amounts of data. - Algorithm creation by data scientists. - Automation of the data scientist's machine learning models and algorithms for production use. - Data visualization for employees and customers. - Most of the time these guys start as traditional solution architects for systems that involve SQL databases, web servers, SAP installations and other "standard" systems. But, to create big data platforms, the engineer needs to be an expert in specifying, setting up, and maintaining big data technologies like: Hadoop, Spark, HBase, Cassandra, MongoDB, Kafka, Redis, and more.
What they also need is experience in how to deploy systems on cloud infrastructure like at Amazon or Google, or on-premise hardware. | Podcast Episode: #048 From Wannabe Data Scientist To Engineer My Journey |------------------| |In this episode Kate Strachnyi interviews me for her Humans of Data Science podcast. We talk about how I found out that I am more into the engineering part of data science. | [Watch on YouTube](https://youtu.be/pIZkTuN5AMM) \ [Listen on Anchor](https://anchor.fm/andreaskayy/episodes/048-From-Wannabe-Data-Scientist-To-Engineer-My-Journey-e45i2o)| ## My Data Science Platform Blueprint I have created a simple and modular big data platform blueprint. It is based on what I have seen in the field and read in tech blogs all over the internet. Why do I believe it will be super useful to you? Because, unlike other blueprints, it is not focused on technology. Following my blueprint will allow you to create the big data platform that exactly fits your needs. Building the perfect platform will allow data scientists to discover new insights. It will enable you to perfectly handle big data and allow you to make data-driven decisions. The blueprint is focused on the five key areas: Connect, Buffer, Processing Framework, Store, and Visualize. ![Data Science Platform Blueprint](/images/Data-Science-Blueprint-New.jpg) Having the platform split like this turns it into a modular platform with loosely coupled interfaces. Why is it so important to have a modular platform? If you have a platform that is not modular, you end up with something that is fixed or hard to modify. This means you cannot adjust the platform to changing requirements of the company. Because of modularity, it is possible to specifically select tools for your use case. It also allows you to replace every component, if you need to. Now, let's talk more about each key area. ### Connect Ingestion is all about getting the data in from the source and making it available to later stages.
Sources can be everything from tweets to server logs, to IoT sensor data (e.g. from cars). Sources send data to your API Services. The API is going to push the data into temporary storage. The temporary storage allows other stages simple and fast access to incoming data. A great solution is to use messaging queue systems like Apache Kafka, RabbitMQ or AWS Kinesis. Sometimes people also use caches like Redis for specialised applications. A good practice is that the temporary storage follows the publish-subscribe pattern. This way APIs can publish messages and Analytics can quickly consume them. ### Buffer In the buffer phase you have pub/sub systems like Apache Kafka, Redis, or other Cloud tools like Google Pub/Sub or AWS Kinesis. These systems are more or less message queues. You put something in on one side and take it out on the other. The idea behind buffers is to have an intermediate system for the incoming data. How this works is, for instance, you're getting data in from an API. The API is publishing into the message queue. Data is buffered there until it is picked up by the processing. If you don't have a buffer, you can run into problems when writing directly into a store or processing the data directly. You can always have peaks of incoming data that stall the systems. Like, it's lunch break and people are working with your app way more than usual. There's more data coming in very very fast, faster than the analytics or the storage can handle. In this case, you would run into problems, because the whole system would stall. It would therefore take long to process the data, and your customers would be annoyed. With a buffer, you buffer the incoming data. Processes for storage and analytics can take out only as much data as they can process. You are no longer in danger of overpowering systems. Buffers are also really good for building pipelines. You take data out of Kafka, pre-process it, and put it back into Kafka.
Then, with another analytics process, you take the processed data back out and put it into a store. Ta-da! A pipeline. ### Processing Framework The analyse stage is where the actual analytics is done in the form of stream and batch processing. Streaming data is taken from ingest and fed into analytics. Streaming analyses the "live" data, thus generating fast results. As the central and most important stage, analytics also has access to the big data storage. Because of that connection, analytics can take a big chunk of data and analyse it. This type of analysis is called batch processing. It will deliver you answers for the big questions. For a short video about batch and stream processing and their use cases, click on the link below: [Adding Batch to a Streaming Pipeline](https://www.youtube.com/watch?v=o-aGi3FmdfU) The analytics process, batch or streaming, is not a one-way process. Analytics can also write data back to the big data storage. Oftentimes, writing data back to the storage makes sense. It allows you to combine previous analytics outputs with the raw data. Analytics give insights when you combine raw data. This combination will often allow you to create even more useful insights. A wide variety of analytics tools are available. Ranging from MapReduce or AWS Elastic MapReduce to Apache Spark and AWS lambda. ### Store This is the typical big-data storage where you just store everything. It enables you to analyse the big picture. Most of the data might seem useless for now, but it is of utmost importance to keep it. Throwing data away is a big no-no. Why not throw something away when it is useless? Although it seems useless for now, data scientists can work with the data. They might find new ways to analyse the data and generate valuable insights from it. What kind of systems can be used to store big data? Systems like Hadoop HDFS, Hbase, Amazon S3 or DynamoDB are a perfect fit to store big data. 
Check out my podcast on how to decide between SQL and NoSQL: ### Visualize Displaying data is as important as ingesting, storing, and analysing it. Visualizations enable business users to make data-driven decisions. This is why it is important to have a good visual presentation of the data. Sometimes you have a lot of different use cases or projects using the platform. It might not be possible to build the perfect UI that fits everyone's needs. What you should do in this case is enable others to build the perfect UI themselves. How to do that? By creating APIs to access the data and making them available to developers. Either way, UI or API, the trick is to give the display stage direct access to the data in the big-data cluster. This kind of access will allow the developers to use analytics results as well as raw data to build the perfect application. ## Who Companies Need For a company, it is important to have well-trained data engineers. That's why companies are looking for people with experience with tools in every part of the above platform blueprint. One common theme I see is cloud platform experience on AWS, Azure or GCP. ## How to Learn Data Engineering ### Interview with Andreas on the Super Data Science Podcast #### Summary This interview with Andreas on Jon Krohn's Super Data Science podcast delves into the intricacies of data engineering, highlighting its critical role in the broader data science ecosystem. Andreas, calling from Northern Bavaria, Germany, shares his journey from a data analyst to becoming a renowned data engineering educator through his Learn Data Engineering Academy. The conversation touches upon the foundational importance of data engineering in ensuring data quality, scalability, and accessibility for data scientists and analysts. Andreas emphasizes that the best data engineers often have a background in the company's domain/niche, which equips them with a deep understanding of the end user's needs.
The discussion also explores the essential tools and skills required in the field, such as relational databases, APIs, ETL tools, data streaming with Kafka, and the significance of learning platforms like AWS, Azure, and GCP. Andreas highlights the evolving landscape of data engineering, with a nod towards the emergence of roles like analytics engineers and the increasing importance of automation and advanced data processing tools like Snowflake, Databricks, and DBT. The interview is not just a technical deep dive but also a personal journey of discovery and passion for data engineering, underscoring the perpetual learning and adaptation required in the fast-evolving field of data science. | Watch or listen to this interview -> 657: How to Learn Data Engineering — with Andreas Kretz |------------------| | Was super fun talking with Jon about Data Engineering on the podcast. Think this will be very helpful for you :) | [Watch on YouTube](https://youtu.be/sbDFADS-zo8) / [Listen to the Podcast](https://www.superdatascience.com/podcast/how-to-learn-data-engineering)| #### Q&A Highlights **Q: What is data engineering, and why is it important?** A: Data engineering is the foundation of the data science process, focusing on collecting, cleaning, and managing data to make it accessible and usable for data scientists and analysts. It's crucial for automating data processes, ensuring data quality, and enabling scalable data analysis and machine learning models. **Q: How does one transition from data analysis to data engineering?** A: The transition involves gaining a deep understanding of data pipelines, learning to work with various data processing and management tools, and developing skills in programming languages and technologies relevant to data engineering, such as SQL, Python, and cloud platforms like AWS or Azure. 
**Q: What are the key skills and tools for a data engineer?** A: Essential skills include proficiency in SQL, experience with ETL tools, knowledge of programming languages like Python, and familiarity with cloud services and data processing frameworks like Apache Spark. Tools like Kafka for data streaming and platforms like Snowflake and Databricks are also becoming increasingly important. **Q: Can you elaborate on the emerging role of analytics engineers?** A: Analytics engineers focus on bridging the gap between raw data management and data analysis, working closely with data warehouses and using tools like dbt to prepare and model data for easy analysis. This role is pivotal in making data more accessible and actionable for decision-making processes. **Q: What advice would you give to someone aspiring to become a data engineer?** A: Start by mastering the basics of SQL and Python, then explore and gain experience with various data engineering tools and technologies. It's also important to understand the data science lifecycle and how data engineering fits within it. Continuous learning and staying updated with industry trends are key to success in this field. **Q: How does a data engineer's role evolve with experience?** A: A data engineer's journey typically starts with focusing on specific tasks or segments of data pipelines, using a limited set of tools. As they gain experience, they broaden their skill set, manage entire data pipelines, and take on more complex projects. Senior data engineers often lead teams, design data architectures, and collaborate closely with data scientists and business stakeholders to drive data-driven decisions. **Q: What distinguishes data engineering from machine learning engineering?** A: While both fields overlap, especially in the use of data, data engineering focuses on the infrastructure and processes for handling data, ensuring its quality and accessibility. 
Machine learning engineering, on the other hand, centers on deploying and maintaining machine learning models in production environments. A strong data engineering foundation is essential for effective machine learning engineering. **Q: Why might a data analyst transition to data engineering?** A: Data analysts may transition to data engineering to work on more technical aspects of data handling, such as building and maintaining data pipelines, automating data processes, and ensuring data scalability. This transition allows them to have a more significant impact on the data lifecycle and contribute to more strategic data initiatives within an organization. **Q: Can you share a challenging project you worked on as a data engineer?** A: One challenging project involved creating a scalable data pipeline for real-time processing of machine-generated data. The complexity lay in handling vast volumes of data, ensuring its quality, and integrating various data sources while maintaining high performance. This project highlighted the importance of selecting the right tools and technologies, such as Kafka for data streaming and Apache Spark for data processing, to meet the project's demands. **Q: How does the cloud influence data engineering?** A: Cloud platforms like AWS, Azure, and GCP have transformed data engineering by providing scalable, flexible, and cost-effective solutions for data storage, processing, and analysis. They offer a wide range of services and tools that data engineers can leverage to build robust data pipelines and infrastructure, facilitating easier access to advanced data processing capabilities and enabling more innovative data solutions. 
**Q: What future trends do you see in data engineering?** A: Future trends in data engineering include the increasing adoption of cloud-native services, the rise of real-time data processing and analytics, greater emphasis on data governance and security, and the continued growth of machine learning and AI-driven data processes. Additionally, tools and platforms that simplify data engineering tasks and enable more accessible data integration and analysis will become more prevalent, democratizing data across organizations. **Q: How does the background of a data analyst contribute to their success as a data engineer?** A: Data analysts have a unique advantage when transitioning to data engineering due to their understanding of data's end-use. Their experience in analyzing data gives them insights into what makes data valuable and usable, enabling them to design more effective and user-centric data pipelines and storage solutions. **Q: What role does automation play in data engineering?** A: Automation is crucial in data engineering for scaling data processes, reducing manual errors, and ensuring consistency in data handling. Automated data pipelines allow for real-time data processing and integration, making data more readily available for analysis and decision-making. **Q: Can you discuss the significance of cloud platforms in data engineering?** A: Cloud platforms like AWS, Azure, and GCP offer scalable, flexible, and cost-effective solutions for data storage, processing, and analysis. They provide data engineers with a suite of tools and services to build robust data pipelines, implement machine learning models, and manage large volumes of data efficiently. **Q: How does data engineering support data science and machine learning projects?** A: Data engineering lays the groundwork for data science and machine learning by preparing and managing the data infrastructure. 
It ensures that high-quality, relevant data is available for model training and analysis, thereby enabling more accurate predictions and insights. **Q: What emerging technologies or trends should data engineers be aware of?** A: Data engineers should keep an eye on the rise of machine learning operations (MLOps) for integrating machine learning models into production, the growing importance of real-time data processing and analytics, and the adoption of serverless computing for more efficient resource management. Additionally, technologies like containerization (e.g., Docker) and orchestration (e.g., Kubernetes) are becoming critical for deploying and managing scalable data applications. **Q: What challenges do data engineers face, and how can they be addressed?** A: Data engineers often grapple with data quality issues, integrating disparate data sources, and scaling data infrastructure to meet growing data volumes. Addressing these challenges requires a solid understanding of data architecture principles, continuous monitoring and testing of data pipelines, and adopting best practices for data governance and management. **Q: How important is collaboration between data engineers and other data professionals?** A: Collaboration is key in the data ecosystem. Data engineers need to work closely with data scientists, analysts, and business stakeholders to ensure that data pipelines are aligned with business needs and analytical goals. Effective communication and a shared understanding of data objectives are vital for the success of data-driven projects. ### Building Blocks to Learn Data Engineering The following Roadmaps all hinge on the courses in my Data Engineering Academy. They are designed to help students who come from many different professions and enable them to build a customized curriculum.
Here are all the courses currently available February 2024: **Colors:** Blue (The Basics), Green (Platform & Pipeline Fundamentals), Orange (Fundamental Tools), Red (Example Projects) ![Building blocks of your curriculum](/images/All-Courses-at-Learn-Data-Engineering.jpg) ### Roadmap for Beginners Start this roadmap at my Academy: [Start Today](https://learndataengineering.com/p/data-engineering-for-beginners) #### 11-Week Data Engineering Roadmap for Beginners & Graduates #### Master the Fundamentals and Build Your First Data Pipelines #### Starting in Data Engineering Starting in data engineering can feel overwhelming, especially if you’re coming from a non-technical background or have only limited experience with coding and databases. This 11-week roadmap, with a time commitment of 5–10 hours per week, is designed to help you build strong foundations in data engineering, step by step, before moving into cloud platforms and more advanced pipelines. You’ll learn essential concepts, hands-on coding, data modeling, and cloud ETL development—everything you need to kickstart your career as a data engineer. --- #### Why This Roadmap is for You - You’re just starting in data engineering and need a clear learning path - You want to build a strong foundation in data platforms, SQL, and Python - You need hands-on experience with data modeling, cloud ETL, and automation - You want to work on real-world projects that prepare you for a data engineering job By the end of this roadmap, you’ll have the skills, tools, and project experience to confidently apply for entry-level data engineering roles and start your career in the field. ![Building blocks of your curriculum](/images/Roadmap-For-Beginners.jpg) --- #### What You’ll Achieve in This Roadmap This roadmap is structured to help you understand the full data engineering workflow: from learning the fundamentals of data platforms and modeling to working with Python, SQL, and cloud-based ETL pipelines. 
#### Learning Goals | Goal | Description | | ----------- | --------------------------------------------------- | | **Goal #1** | Gain Experience in Data Platforms & Pipeline Design | | **Goal #2** | Work with Data Like a Data Engineer Using Python & SQL | | **Goal #3** | Learn Dimensional Data Modeling & Data Warehousing with Snowflake | | **Goal #4** | Gain Experience with ELT Using dbt & Orchestration with Airflow | | **Goal #5** | Build Your First ETL Pipeline on a Cloud Platform | --- #### 11-Week Learning Roadmap | Week | Topic | Key Learning Outcomes | | --------------- | ----------------------------------------- | ------------------------------------------------------------------------------- | | **Week 1** | Introduction & Platform & Pipeline Design | Understand data platforms, data pipelines, and the tools used in data engineering | | **Week 2** | Relational Data Modeling | Develop skills in creating relational data models for structured data | | **Week 3 & 4** | Python for Data Engineers | Learn Python for data processing, data manipulation, and pipeline development | | **Week 5** | Advanced SQL | Gain expertise in querying, storing, and manipulating data in relational databases | | **Week 6** | Dimensional Data Modeling | Master the techniques of dimensional modeling for analytics and reporting | | **Week 7** | Snowflake Data Warehousing | Learn how to use Snowflake as a cloud data warehouse | | **Week 8** | Data Transformation with dbt | Transform and model data efficiently using dbt | | **Week 9** | Data Pipeline Orchestration with Airflow | Automate and manage data workflows using Apache Airflow | | **Week 10 & 11**| End-to-End Project on AWS, Azure, or GCP | Complete an end-to-end project on a cloud platform of your choice | --- #### Week 1: Introduction & Platform & Pipeline Design ##### 1. 
Learn the Basics of Platform & Pipeline Design ##### Data Platform and Pipeline Design **Learn how to build data pipelines with templates and examples for Azure, GCP, and Hadoop** ##### Description Data pipelines are the backbone of any Data Science platform. They are essential for data ingestion, processing, and machine learning workflows. This training will help you understand how to create stream and batch processing pipelines as well as machine learning pipelines by going through the most essential basics—complemented by templates and examples for useful cloud computing platforms. Check out this course in my Academy: [Learn More](https://learndataengineering.com/p/data-pipeline-design) ##### Detailed Course Curriculum | Module | Lesson | Duration | |--------|--------|----------| | **Platform & Pipeline Basics** | The Platform Blueprint | 10:11 | | | Data Engineering Tools Guide | 2:44 | | | End-to-End Pipeline Example | 6:18 | | **Ingestion Pipelines** | Push Ingestion Pipelines | 3:42 | | | Pull Ingestion Pipelines | 3:34 | | **Pipeline Types** | Batch Pipelines | 3:07 | | | Streaming Pipelines | 3:34 | | **Visualization** | Stream Analytics | 2:26 | | | Visualization Pipelines | 3:47 | | | Visualization with Hive & Spark on Hadoop | 6:21 | | | Visualization Data via Spark Thrift Server | 3:27 | | **Platform Examples** | AWS, Azure, GCP (Currently Slides Only) | START | --- ##### 2. Get to Know the Different Data Stores ##### Choosing Data Stores **Learn the different types of data storages and when to use which** ##### Description One part of creating a data platform and pipelines is to choose data stores, which is the focus of this training. You will learn about relational databases, NoSQL databases, data warehouses, and data lakes. The goal is to help you understand when to use each type of data storage and how to incorporate them into your pipeline. 
Check out this course in my Academy: [Learn More](https://learndataengineering.com/p/choosing-data-stores) ##### Detailed Course Curriculum | Module | Lesson | Duration | |--------|--------|----------| | | What are Data Stores? | 2:09 | | **Data Stores Basics** | OLTP vs OLAP | 7:34 | | | ETL vs ELT | 5:45 | | | Data Stores Ranking | 4:05 | | **Relational Databases** | How to Choose Data Stores | 8:11 | | | Relational Databases Concepts | 6:34 | | **NoSQL Databases** | NoSQL Basics | 10:39 | | | Document Stores | 5:56 | | | Time Series Databases | 5:00 | | | Search Engines | 4:18 | | | Wide Column Stores | 4:22 | | | Key Value Stores | 4:59 | | | Graph Databases | 1:05 | | **Data Warehouses & Data Lakes** | Data Warehouses | 5:32 | | | Data Lakes | 7:10 | --- #### 3. See Data Modeling Examples for the Learned Data Stores ##### Data Modeling 1 **Learn how to design schemas for SQL, NoSQL, and Data Warehouses** ##### Description Schema design is a critical skill for data engineers. This training covers schema design for different data stores using an e-commerce dataset. You will see examples of how the same dataset is modeled for relational databases, NoSQL stores, wide column stores, document stores, key-value stores, and data warehouses. This will help you understand how to create maintainable models and avoid data swamps. 
Check out this course in my Academy: [Learn More](https://learndataengineering.com/p/data-modeling) ##### Detailed Course Curriculum | Module | Lesson | Duration | |--------|--------|----------| | | Why Data Modeling Is Important | 5:44 | | | A Good Dataset | 1:28 | | **Relational Databases** | Schema Design | 9:27 | | **Wide Column Stores** | Schema Design | 7:35 | | **Document Stores** | Schema Design | 7:28 | | **Key Value Stores** | Schema Design | 4:49 | | **Data Warehouses** | Schema Design | 4:44 | | **Data Modeling Workshop** | November 2024 | 101:49 | --- #### Week 2: Relational Data Modeling ##### Start with Relational Data Modeling **Relational Data modeling** is an essential skill, as even in modern "big data" environments, relational databases are often used for managing and serving metadata. This week focuses on building a strong foundation in relational data modeling, which is crucial for structuring data effectively and optimizing query performance. ##### Relational Data Modeling **Learn the most important basics to create a data model for OLTP data stores** ###### Description This course covers everything you need to know about relational data modeling—from understanding entities, attributes, and relationships to normalizing data models up to the third normal form (3NF). You will learn how to design conceptual, logical, and physical data models, implement primary and foreign keys, and ensure data quality through constraints and validations. Practical exercises include setting up a MySQL server with Docker and creating ER diagrams using MySQL Workbench. 
Check out this course in my Academy: [Learn More](https://learndataengineering.com/p/relational-data-modeling) ##### Detailed Course Curriculum | Module | Lesson | Duration | |--------|--------|----------| | **Basics and Prepare the Environment** | Relational Data Models History | 3:16 | | | Installing MySQL Server and MySQL Workbench | 8:04 | | | MySQL Workbench Introduction | 4:36 | | **Create the Conceptual Data Model** | The Design Process Explained | 4:14 | | | Discover the Entities | 10:24 | | | Discover the Attributes | 13:09 | | | Define Entity Relationships and Normalize the Data | 11:19 | | **Defining and Resolving Relationships** | Identifying vs Non-Identifying Relationships | 2:01 | | | How to Resolve Many-to-Many Relationships | 4:00 | | | How to Resolve One-to-Many Relationships | 2:34 | | | How to Resolve One-to-One Relationships | 1:45 | | **Hands-On Workbench - Creating the Database** | Create Your ER Diagram Using Workbench | 19:46 | | | Create a Physical Data Model | 4:13 | | | Populate the MySQL DB with Data from .xls File | 15:13 | --- #### Week 3 & 4: Python for Data Engineers ##### Description This course offers a comprehensive guide to using Python for data engineering tasks. You’ll learn advanced Python features, including data processing with Pandas, working with APIs, interacting with PostgreSQL databases, and handling data types like JSON. The course also covers important programming concepts like exception handling, modules, unit testing, and object-oriented programming—all within the context of data engineering. 
Check out this course in my Academy: [Learn More](https://learndataengineering.com/p/python-for-data-engineers) ##### Detailed Course Curriculum | Module | Lesson | Duration | |--------|--------|----------| | **Advanced Python** | Classes | 4:37 | | | Modules | 3:06 | | | Exception Handling | 8:55 | | | Logging | 5:12 | | **Data Engineering** | Datetime | 8:04 | | | JSON | 9:54 | | | JSON Validation | 15:10 | | | UnitTesting | 16:44 | | | Pandas: Intro & Data Types | 8:43 | | | Pandas: Appending & Merging DataFrames | 7:49 | | | Pandas: Normalizing & Lambdas | 4:12 | | | Pandas: Pivot & Parquet Write, Read | 6:17 | | | Pandas: Melting & JSON Normalization | 8:15 | | | Numpy | 4:47 | | **Working with Data Sources/Sinks** | Requests (Working with APIs) | 11:15 | | | Working with Databases: Setup | 4:06 | | | Working with Databases: Tables, Bulk Load, Queries | 8:12 | --- #### Week 5: SQL for Data Engineers ##### Description SQL is the backbone of working with relational databases, and if you’re getting into Data Engineering, mastering SQL is a must. This course provides the essential SQL skills needed to work with databases effectively. You'll learn how to manage data, build efficient queries, and perform advanced operations to handle real-world data challenges. 
Check out this course in my Academy: [Learn More](https://learndataengineering.com/p/sql-for-data-engineers) ##### Detailed Course Curriculum | Module | Lesson | Duration | |--------|--------|----------| | **Basics** | Database Management Systems & SQL | 3:49 | | | The Chinook Database | 3:03 | | | SQLite Installation | 7:02 | | | DBeaver Installation | 4:08 | | | Data Types in SQLite | 6:15 | | **Basic SQL** | DML & DDL | 15:06 | | | Select Statements | 6:03 | | | Grouping & Aggregation | 10:12 | | | Joins | 10:05 | | **Advanced SQL** | TCL Transaction Control Language | 6:42 | | | Common Table Expressions & Subqueries | 10:26 | | | Window Functions 1: Concept & Syntax | 5:00 | | | Window Functions 2: Aggregate Functions | 7:24 | | | Window Functions 3: Ranking Functions | 6:05 | | | Window Functions 4: Analytical Functions | 7:20 | | **Optimization** | Query Optimization | START | | | Indexing Best Practices | START | --- #### Week 6: Dimensional Data Modeling ##### Description Dimensional data modeling is a crucial skill for data engineers working with analytics use-cases where data needs to be structured efficiently for reporting and business insights. This course covers the basics of dimensional modeling, the medallion architecture, and how to create data models for OLAP data stores.
Check out this course in my Academy: [Learn More](https://learndataengineering.com/p/data-modeling-3-dimensional-data-modeling) ##### Detailed Course Curriculum | Module | Lesson | Duration | |--------|--------|----------| | | Data Warehousing Basics | 6:42 | | **Dimensional Modeling Basics** | Approaches to building a data warehouse | 5:20 | | | Dimension tables explained | 5:34 | | | Fact tables explained | 6:34 | | | Identifying dimensions | 3:16 | | **Data Warehouse Setup** | What is DuckDB | 5:58 | | | First DuckDB hands-on | 2:20 | | | Creating tables in DuckDB | 2:40 | | | Installing DBeaver | 6:49 | | **Working With The Data Warehouse** | Exploring SCD0 and SCD1 | 19:57 | | | Exploring SCD2 | 13:52 | | | Exploring transaction fact table | 6:28 | | | Exploring accumulating fact table | 7:17 | --- #### Week 7: Snowflake for Data Engineers ##### Description Snowflake is a highly popular cloud-based data warehouse that is ideal for beginners due to its simplicity and powerful features. In this course, you will learn how to set up Snowflake, load and process data, and create visualizations. The course covers both SQL and Python methods for managing data within Snowflake, and provides hands-on experience with connecting Snowflake to other tools such as PowerBI. 
Check out this course in my Academy: [Learn More](https://learndataengineering.com/p/snowflake-for-data-engineers) ##### Detailed Course Curriculum | Module | Lesson | Duration | |--------|--------|----------| | **Introduction** | Snowflake basics | 4:16 | | | Data Warehousing basics | 4:13 | | | How Snowflake fits into data platforms | 3:14 | | **Setup** | Snowflake Account setup | 4:24 | | | Creating your warehouse & UI overview | 4:15 | | **Loading CSVs from your PC** | Our dataset & goals | 3:01 | | | Setup Snowflake database | 10:29 | | | Preparing the upload file | 8:31 | | | Using internal stages with SnowSQL | 12:37 | | | Splitting a data table into two tables | 6:38 | | **Visualizing Data** | Creating a visualization worksheet | 7:08 | | | Creating a dashboard | 5:23 | | | Connect PowerBI to Snowflake | 6:03 | | | Query data with Python | 7:35 | | **Automation** | Create import task | 9:18 | | | Create table refresh task | 3:40 | | | Test our pipeline | 3:14 | | **AWS S3 Integration** | Working with external stages for AWS S3 | 10:20 | | | Implementing snowpipe with S3 | 6:19 | --- #### Week 8: dbt for Data Engineers ##### Description This course introduces dbt (Data Build Tool), a SQL-first transformation workflow that allows you to transform, test, and document data directly within your data warehouse. You will learn how to set up dbt, connect it with Snowflake, create data pipelines, and implement advanced features like CI/CD and documentation generation. This training is ideal for data engineers looking to build trusted datasets for reporting, machine learning, and operational workflows. 
Check out this course in my Academy: [Learn More](https://learndataengineering.com/p/dbt-for-data-engineers) ##### Detailed Course Curriculum | Module | Lesson | Duration | |--------|--------|----------| | **dbt Introduction & Setup** | Modern data experience | 5:42 | | | Introduction to dbt | 4:38 | | | Goals of this course | 4:50 | | | Snowflake preparation | 7:29 | | | Loading data into Snowflake | 4:48 | | | Setup dbt Core | 9:35 | | | Preparing the GitHub repository | 3:32 | | **Working with dbt-Core** | dbt models & materialization explained | 6:16 | | | Creating your first SQL model | 5:48 | | | Working with custom schemas | 5:28 | | | Creating your first Python model | 4:35 | | | dbt sources | 1:55 | | | Configuring sources | 4:03 | | | Working with seed files | 4:20 | | **Tests in dbt** | Generic tests | 3:19 | | | Tests with Great Expectations | 3:25 | | | Writing custom generic tests | 2:49 | | **Working with dbt-Cloud** | dbt cloud setup | 7:25 | | | Creating dbt jobs | 5:14 | | | CI/CD automation with dbt cloud and GitHub | 10:52 | | | Documentation in dbt | 7:38 | --- #### Week 9: Apache Airflow Workflow Orchestration ##### Description Airflow is a platform-independent workflow orchestration tool that offers many possibilities to create and monitor stream and batch pipeline processes. It supports complex, multi-stage processes across major platforms and tools in the data engineering world, such as AWS or Google Cloud. Airflow is not only great for planning and organizing your processes but also provides robust monitoring capabilities, allowing you to keep track of data workflows and troubleshoot effectively. 
Check out this course in my Academy: [Learn More](https://learndataengineering.com/p/learn-apache-airflow) ##### Detailed Course Curriculum | Module | Lesson | Duration | |--------|--------|----------| | **Airflow Workflow Orchestration** | Airflow Usage | 3:19 | | **Airflow Fundamental Concepts** | Fundamental Concepts | 2:47 | | | Airflow Architecture | 3:09 | | | Example Pipelines | 4:49 | | | Spotlight 3rd Party Operators | 2:17 | | | Airflow XComs | 4:32 | | **Hands-On Setup** | Project Setup | 1:43 | | | Docker Setup Explained | 2:06 | | | Docker Compose & Starting Containers | 4:23 | | | Checking Services | 1:48 | | | Setup WeatherAPI | 1:33 | | | Setup Postgres DB | 1:58 | | **Learn Creating DAGs** | Airflow Webinterface | 4:37 | | | Creating DAG With Airflow 2.0 | 9:46 | | | Running our DAG | 4:15 | | | Creating DAG With TaskflowAPI | 6:59 | | | Getting Data From the API With SimpleHTTPOperator | 3:38 | | | Writing into Postgres | 4:12 | | | Parallel Processing | 4:15 | --- #### Week 10 & 11: End-to-End Project on AWS, Azure, or GCP ##### Important: Choose One Project Participants need to select **one** of the following cloud platforms to complete their end-to-end data engineering project. It is not necessary to complete all three projects. ##### AWS Project Introduction The AWS project is designed for those who want to get started with cloud platforms, particularly with Amazon Web Services, the leading platform in data processing. This project will guide you through setting up an end-to-end data engineering pipeline using AWS tools like Lambda, API Gateway, Glue, Redshift, Kinesis, and DynamoDB. You will work with an e-commerce dataset, learn data modeling, and implement both stream and batch processing pipelines. 
Check out this course in my Academy: [Learn More](https://learndataengineering.com/p/data-engineering-on-aws) ##### Detailed AWS Project Curriculum | Module | Lesson | Duration | |--------|--------|----------| | | Data Engineering | 4:15 | | | Data Science Platform | 5:20 | | **The Dataset** | Data Types You Encounter | 3:03 | | | What Is A Good Dataset | 2:54 | | | The Dataset We Use | 3:16 | | | Defining The Purpose | 6:27 | | | Relational Storage Possibilities | 3:46 | | | NoSQL Storage Possibilities | 6:28 | | **Platform Design** | Selecting The Tools | 3:49 | | | Client | 3:05 | | | Connect | 1:18 | | | Buffer | 1:28 | | | Process | 2:42 | | | Store | 3:41 | | | Visualize | 3:00 | | **Data Pipelines** | Data Ingestion Pipeline | 3:00 | | | Stream To Raw Storage Pipeline | 2:19 | | | Stream To DynamoDB Pipeline | 3:09 | | | Visualization API Pipeline | 2:56 | | | Visualization Redshift Data Warehouse Pipeline | 5:29 | | | Batch Processing Pipeline | 3:19 | | **AWS Basics** | Create An AWS Account | 1:58 | | | Things To Keep In Mind | 2:45 | | | IAM Identity & Access Management | 4:06 | | | Logging | 2:22 | | | AWS Python API Boto3 | 2:57 | | **Data Ingestion Pipeline** | Development Environment | 4:02 | | | Create Lambda for API | 2:33 | | | Create API Gateway | 8:30 | | | Setup Kinesis | 1:38 | | | Setup IAM for API | 5:00 | | | Create Ingestion Pipeline (Code) | 6:09 | | | Create Script to Send Data | 5:46 | | | Test The Pipeline | 4:53 | | **Stream To Raw S3 Storage Pipeline** | Setup S3 Bucket | 3:42 | | | Configure IAM For S3 | 3:21 | | | Create Lambda For S3 Insert | 7:16 | | | Test The Pipeline | 4:01 | | **Stream To DynamoDB Pipeline** | Setup DynamoDB | 9:00 | | | Setup IAM For DynamoDB Stream | 3:36 | | | Create DynamoDB Lambda | 9:20 | | **Visualization API** | Create API & Lambda For Access | 6:10 | | | Test The API | 4:47 | | **Visualization Pipeline Redshift Data Warehouse** | Setup Redshift Data Warehouse | 8:08 | | | Security Group For Firehose 
| 3:12 | | | Create Redshift Tables | 5:51 | | | S3 Bucket & jsonpaths.json | 3:02 | | | Configure Firehose | 7:58 | | | Debug Redshift Streaming | 7:43 | | | Bug-fixing | 5:58 | | | Power BI | 12:16 | | **Batch Processing Pipeline** | AWS Glue Basics | 5:14 | | | Glue Crawlers | 13:09 | | | Glue Jobs | 13:43 | | | Redshift Insert & Debugging | 7:16 | --- ##### Azure Project Introduction The Azure project is designed for those who want to build a streaming data pipeline using Microsoft Azure's robust cloud platform. This project introduces you to Azure services such as APIM, Blob Storage, Azure Functions, Cosmos DB, and Power BI. You will gain practical experience by building a pipeline that ingests, processes, stores, and visualizes data, using Python and Visual Studio Code. Check out this course in my Academy: [Learn More](https://learndataengineering.com/p/build-streaming-data-pipelines-in-azure) ##### Detailed Azure Project Curriculum | Module | Lesson | Duration | |--------|--------|----------| | **Project Introduction** | Data Engineering in Azure - Streaming Data Pipelines | 2:43 | | **Datasets and Local Preprocessing** | Introduction to Datasets and Local Preprocessing | 7:06 | | | Deploying Code on Visual Studio to Docker Containers | 5:27 | | **Azure Functions and Blob Storage** | Develop Azure Functions via Python and VS Code | 5:52 | | | Deploy Azure Function to Azure Function App and Test It | 6:26 | | | Integrate Azure Function with Blob Storage via Bindings | 4:58 | | **Add Azure Function to Azure API Management (APIM)** | Expose Azure Function as a Backend | 7:05 | | | Securely Store Secrets in Azure Key Vault | 4:41 | | | Add Basic Authentication in API Management | 4:35 | | | Test APIM and Imported Azure Function via Local Python Program | 2:34 | | **Create and Combine Event Hubs, Azure Function, and Cosmos DB** | Create Event Hubs and Test Capture Events Feature | 6:59 | | | Modify Existing Azure Function to Include Event Hubs Binding | 6:42 | | 
**Write Tweets to Cosmos DB (Core SQL) from Event Hub** | Create a Cosmos DB (Core SQL) | 9:03 | | | Create a New Azure Function that Writes Messages to Cosmos DB | 9:03 | | **Connect Power BI Desktop to Your Cosmos DB** | Connect Power BI Desktop via Connector and Create a Dashboard | 6:32 | --- ##### GCP Project Introduction The GCP project is designed for those who want to learn how to build, manage, and optimize data pipelines on Google Cloud Platform. This project focuses on building an end-to-end pipeline that extracts data from an external weather API, processes it through GCP's data tools, and visualizes the results using Looker Studio. This project offers practical, hands-on experience with tools like Cloud SQL, Compute Engine, Cloud Functions, Pub/Sub, and Looker Studio. Check out this course in my Academy: [Learn More](https://learndataengineering.com/p/data-engineering-on-gcp) ##### Detailed GCP Project Curriculum | Module | Lesson | Duration | |--------|--------|----------| | **Introduction** | Introduction | 1:13 | | | GitHub & the Team | 1:30 | | **Data & Goals** | Architecture of the Project | 3:19 | | | Introduction to Weather API | 2:18 | | | Setup Google Cloud Account | 2:12 | | **Project Setup** | Creating the Project | 2:35 | | | Enabling the Required APIs | 1:34 | | | Configure Scheduling | 2:20 | | **Pipeline Creation - Extract from API** | Setup VM for Database Interaction | 2:53 | | | Setup MySQL Database | 2:16 | | | Setup VM Client and Create Database | 2:46 | | | Creating Pub/Sub Message Queue | 1:41 | | | Create Cloud Function to Pull Data from API | 4:17 | | | Explanation of Code to Pull from API | 4:20 | | **Pipeline Creation - Write to Database** | Create Function to Write to Database | 7:47 | | | Explanation of Code to Write Data to Database | 5:56 | | | Testing the Function | 5:51 | | | Create Function Write Data to DB - Pull | 3:53 | | | Explanation Code Write Data to DB - Pull | 4:33 | | **Visualization** | Setup Looker Studio 
and Create Bubble Chart | 2:20 | | | Setup Looker Studio and Create Time Series Chart | 1:57 | | | Pipeline Monitoring | 6:20 | --- ##### What’s Next? After completing this roadmap, you’ll have the confidence and skills to not just analyze data but to engineer and optimize it like a pro! Explore advanced topics, start contributing to projects, and showcase your new skills to potential employers. ### Roadmap for Data Analysts Start this roadmap at my Academy: [Start Today](https://learndataengineering.com/p/data-engineering-for-data-analysts) #### Go Beyond SQL and Learn How to Build, Automate, and Optimize Data Pipelines Like an Engineer #### Who Is This 10-Week Roadmap For? - Data Analysts who want to understand the full data lifecycle - Those looking to move beyond SQL and build real data pipelines - Professionals seeking hands-on, practical experience to boost their resumes - Anyone wanting to stay competitive in the job market #### What You’ll Achieve This roadmap provides a step-by-step approach to mastering data engineering skills. You'll start with Python and data modeling, move on to building pipelines, work with cloud platforms, and finally automate workflows using industry-standard tools.
![Building blocks of your curriculum](/images/Roadmap-From-Data-Analyst-to-Engineer.jpg) --- #### Learning Goals | Goal | Description | | ----------- | --------------------------------------------------- | | **Goal #1** | Master Python & Relational Data Modeling | | **Goal #2** | Build Your First ETL Pipeline on AWS (or Azure/GCP) | | **Goal #3** | Gain Hands-On Experience with Snowflake & dbt | | **Goal #4** | Connect AWS and Snowflake | | **Goal #5** | Automate Your Data Pipeline with Airflow | --- #### 10-Week Learning Roadmap | Week | Topic | Key Learning Outcomes | | --------------- | ----------------------------------------- | ------------------------------------------------------------------------------- | | **Week 1** | Introduction to Data Engineering & Python | Understand core concepts of data engineering and Python programming basics | | **Week 2** | Platform & Pipeline Design | Learn how to design effective data platforms and pipelines | | **Week 3** | Relational Data Modeling | Develop skills in creating relational data models for structured data | | **Week 4** | Dimensional Data Modeling | Master the techniques of dimensional modeling for analytics and reporting | | **Week 5** | Docker Fundamentals & APIs | Get hands-on with containerization using Docker and working with APIs | | **Week 6 & 7** | End-to-End Project on AWS, Azure, or GCP | Complete an end-to-end data engineering project on one cloud platform of your choice | | **Week 8** | Working with Snowflake | Gain practical experience using Snowflake as a data warehouse | | **Week 9** | Transforming Data With dbt | Learn to transform and model data efficiently using dbt | | **Week 10** | Pipeline Orchestration with Airflow | Automate and manage data workflows using Apache Airflow | --- #### Detailed Weekly Content #### Week 1: Introduction to Data Engineering & Python If you want to take your data engineering skills to the next level, you are in the right place.
Python has become the go-to language for data analysis and machine learning, and with our training, you will learn how to successfully use Python to build robust data pipelines and manipulate data efficiently. This comprehensive training program is designed for data engineers of all levels. Whether you are just starting out in data engineering or you are an experienced engineer looking to expand your skill set, our Python for Data Engineers training will give you the tools you need to excel in your field. At the end of the training, you will have a strong foundation in Python and data engineering and be ready to tackle complex data engineering projects with ease. Check out this course in my Academy: [Learn More](https://learndataengineering.com/p/python-for-data-engineers) ##### Course Curriculum | Lesson | Duration | |--------|----------| | Classes | 4:37 | | Modules | 3:06 | | Exception Handling | 8:55 | | Logging | 5:12 | | Datetime | 8:04 | | JSON | 9:54 | | JSON Validation | 15:10 | | UnitTesting | 16:44 | | Pandas: Intro & data types | 8:43 | | Pandas: Appending & Merging DataFrames | 7:49 | | Pandas: Normalizing & Lambdas | 4:12 | | Pandas: Pivot & Parquet write, read | 6:17 | | Pandas: Melting & JSON normalization | 8:15 | | Numpy | 4:47 | | Requests (Working with APIs) | 11:15 | | Working with Databases: Setup | 4:06 | | Working with Databases: Tables, bulk load, queries | 8:12 | --- #### Week 2: Platform & Pipeline Design ##### Description Data pipelines are the number one thing within the Data Science platform. Without them, data ingestion or machine learning processing, for example, would not be possible. This 110-minute long training will help you understand how to create stream and batch processing pipelines as well as machine learning pipelines by going through some of the most essential basics - complemented by templates and examples for useful cloud computing platforms. 
Check out this course in my Academy: [Learn More](https://learndataengineering.com/p/data-pipeline-design) ##### Course Curriculum | Lesson | Duration | |--------|----------| | Platform Blueprint & End to End Pipeline Example | 10:11 | | Data Engineering Tools Guide | 2:44 | | End to End Pipeline Example | 6:18 | | Push Ingestion Pipelines | 3:42 | | Pull Ingestion Pipelines | 3:34 | | Batch Pipelines | 3:07 | | Streaming Pipelines | 3:34 | | Stream Analytics | 2:26 | | Lambda Architecture | 4:02 | | Visualization Pipelines | 3:47 | | Visualization with Hive & Spark on Hadoop | 6:21 | | Visualization Data via Spark Thrift Server | 3:27 | --- #### Week 3: Relational Data Modeling ##### Description Relational modeling is often used for building transactional databases. You might say, 'But I'm not planning to become a back-end engineer.' Apart from knowing how to move data, you should also know how to store it effectively, which involves designing a scalable data model optimized for faster query response times and efficient data retrieval.
Check out this course in my Academy: [Learn More](https://learndataengineering.com/p/relational-data-modeling) ##### Course Curriculum | Lesson | Duration | |--------|----------| | Relational Data Models History | 3:16 | | Installing MySQL Server and MySQL Workbench | 8:04 | | MySQL Workbench Introduction | 4:36 | | The Design Process Explained | 4:14 | | Discover the Entities | 10:24 | | Discover the Attributes | 13:09 | | Define Entity Relationships and Normalize the Data | 11:19 | | Identifying vs Non-identifying Relationships | 2:01 | | Resolve Many-to-Many Relationships | 4:00 | | Resolve One-to-Many Relationships | 2:34 | | Resolve One-to-One Relationships | 1:45 | | Create ER Diagram Using Workbench | 19:46 | | Create a Physical Data Model | 4:13 | | Populate MySQL DB with Data from .xls File | 15:13 | | Course Conclusion | 1:28 | --- #### Week 4: Dimensional Data Modeling ##### Description In today’s data-driven world, efficient data organization is key to enabling insightful analysis and reporting. Dimensional data modeling is a crucial technique that helps structure your data for faster querying and better decision-making. 
Check out this course in my Academy: [Learn More](https://learndataengineering.com/p/data-modeling-3-dimensional-data-modeling) ##### Course Curriculum | Lesson | Duration | |--------|----------| | Intro to Data Warehousing | 6:42 | | Approaches to Building a Data Warehouse | 5:20 | | Dimension Tables Explained | 5:34 | | Fact Tables Explained | 6:34 | | Identifying Dimensions | 3:16 | | What is DuckDB | 5:58 | | First DuckDB Hands-on | 2:20 | | Creating Tables in DuckDB | 2:40 | | Installing DBeaver | 6:49 | | Exploring SCD0 and SCD1 | 19:57 | | Exploring SCD2 | 13:52 | | Exploring Transaction Fact Table | 6:28 | | Exploring Accumulating Fact Table | 7:17 | | Course Conclusion | 0:52 | --- #### Week 5: Docker Fundamentals & APIs ##### Description Week 5 covers two crucial topics: containerization using Docker and building APIs with FastAPI. Docker is essential for creating lightweight, self-contained containers, while APIs are the backbone of data platforms. Check out Docker Fundamentals in my Academy: [Learn More](https://learndataengineering.com/p/docker-fundamentals) Check out Building APIs with FastAPI in my Academy: [Learn More](https://learndataengineering.com/p/apis-with-fastapi-course) ##### Course Curriculum ##### Docker Fundamentals | Lesson | Duration | |--------|----------| | Docker vs Virtual Machines | 6:23 | | Docker Terminology | 5:56 | | Installing Docker Desktop | 4:09 | | Pulling Images & Running Containers | 6:34 | | Docker Compose | 6:34 | | Build & Run Simple Image | 6:28 | | Build Image with Dependencies | 5:05 | | Using DockerHub Image Registry | 4:24 | | Image Layers & Security Best Practices | 7:55 | | Managing Docker with Portainer | 4:04 | ##### Building APIs with FastAPI | Lesson | Duration | |--------|----------| | What are APIs?
| 8:29 | | Hosting vs Using APIs | 4:08 | | HTTP Methods & Media Types | 6:56 | | API Parameters & Response Codes | 9:40 | | Setting up FastAPI | 4:55 | | Creating APIs: POST, GET, PUT | 16:18 | | Testing APIs with Postman | 4:22 | | Deploying FastAPI with Docker | 6:01 | | API Security Best Practices | 3:48 | --- #### Week 6 & 7: End-to-End Project on AWS, Azure, or GCP ##### Important: Choose One Project Participants need to select **one** of the following cloud platforms to complete their end-to-end data engineering project. It is not necessary to complete all three projects. ##### AWS Project Introduction The AWS project is designed for those who want to get started with cloud platforms, particularly with Amazon Web Services, the leading platform in data processing. This project will guide you through setting up an end-to-end data engineering pipeline using AWS tools like Lambda, API Gateway, Glue, Redshift, Kinesis, and DynamoDB. You will work with an e-commerce dataset, learn data modeling, and implement both stream and batch processing pipelines. 
Check out this project in my Academy: [Learn More](https://learndataengineering.com/p/data-engineering-on-aws) ##### Detailed AWS Project Curriculum | Module | Lesson | Duration | |--------|--------|----------| | | Data Engineering | 4:15 | | | Data Science Platform | 5:20 | | **The Dataset** | Data Types You Encounter | 3:03 | | | What Is A Good Dataset | 2:54 | | | The Dataset We Use | 3:16 | | | Defining The Purpose | 6:27 | | | Relational Storage Possibilities | 3:46 | | | NoSQL Storage Possibilities | 6:28 | | **Platform Design** | Selecting The Tools | 3:49 | | | Client | 3:05 | | | Connect | 1:18 | | | Buffer | 1:28 | | | Process | 2:42 | | | Store | 3:41 | | | Visualize | 3:00 | | **Data Pipelines** | Data Ingestion Pipeline | 3:00 | | | Stream To Raw Storage Pipeline | 2:19 | | | Stream To DynamoDB Pipeline | 3:09 | | | Visualization API Pipeline | 2:56 | | | Visualization Redshift Data Warehouse Pipeline | 5:29 | | | Batch Processing Pipeline | 3:19 | | **AWS Basics** | Create An AWS Account | 1:58 | | | Things To Keep In Mind | 2:45 | | | IAM Identity & Access Management | 4:06 | | | Logging | 2:22 | | | AWS Python API Boto3 | 2:57 | | **Data Ingestion Pipeline** | Development Environment | 4:02 | | | Create Lambda for API | 2:33 | | | Create API Gateway | 8:30 | | | Setup Kinesis | 1:38 | | | Setup IAM for API | 5:00 | | | Create Ingestion Pipeline (Code) | 6:09 | | | Create Script to Send Data | 5:46 | | | Test The Pipeline | 4:53 | | **Stream To Raw S3 Storage Pipeline** | Setup S3 Bucket | 3:42 | | | Configure IAM For S3 | 3:21 | | | Create Lambda For S3 Insert | 7:16 | | | Test The Pipeline | 4:01 | | **Stream To DynamoDB Pipeline** | Setup DynamoDB | 9:00 | | | Setup IAM For DynamoDB Stream | 3:36 | | | Create DynamoDB Lambda | 9:20 | | **Visualization API** | Create API & Lambda For Access | 6:10 | | | Test The API | 4:47 | | **Visualization Pipeline Redshift Data Warehouse** | Setup Redshift Data Warehouse | 8:08 | | | Security Group For Firehose 
| 3:12 | | | Create Redshift Tables | 5:51 | | | S3 Bucket & jsonpaths.json | 3:02 | | | Configure Firehose | 7:58 | | | Debug Redshift Streaming | 7:43 | | | Bug-fixing | 5:58 | | | Power BI | 12:16 | | **Batch Processing Pipeline** | AWS Glue Basics | 5:14 | | | Glue Crawlers | 13:09 | | | Glue Jobs | 13:43 | | | Redshift Insert & Debugging | 7:16 | --- ##### Azure Project Introduction The Azure project is designed for those who want to build a streaming data pipeline using Microsoft Azure's robust cloud platform. This project introduces you to Azure services such as APIM, Blob Storage, Azure Functions, Cosmos DB, and Power BI. You will gain practical experience by building a pipeline that ingests, processes, stores, and visualizes data, using Python and Visual Studio Code. Check out this project in my Academy: [Learn More](https://learndataengineering.com/p/build-streaming-data-pipelines-in-azure) ##### Detailed Azure Project Curriculum | Module | Lesson | Duration | |--------|--------|----------| | **Project Introduction** | Data Engineering in Azure - Streaming Data Pipelines | 2:43 | | **Datasets and Local Preprocessing** | Introduction to Datasets and Local Preprocessing | 7:06 | | | Deploying Code on Visual Studio to Docker Containers | 5:27 | | **Azure Functions and Blob Storage** | Develop Azure Functions via Python and VS Code | 5:52 | | | Deploy Azure Function to Azure Function App and Test It | 6:26 | | | Integrate Azure Function with Blob Storage via Bindings | 4:58 | | **Add Azure Function to Azure API Management (APIM)** | Expose Azure Function as a Backend | 7:05 | | | Securely Store Secrets in Azure Key Vault | 4:41 | | | Add Basic Authentication in API Management | 4:35 | | | Test APIM and Imported Azure Function via Local Python Program | 2:34 | | **Create and Combine Event Hubs, Azure Function, and Cosmos DB** | Create Event Hubs and Test Capture Events Feature | 6:59 | | | Modify Existing Azure Function to Include Event Hubs Binding | 6:42 | | 
**Write Tweets to Cosmos DB (Core SQL) from Event Hub** | Create a Cosmos DB (Core SQL) | 9:03 | | | Create a New Azure Function that Writes Messages to Cosmos DB | 9:03 | | **Connect Power BI Desktop to Your Cosmos DB** | Connect Power BI Desktop via Connector and Create a Dashboard | 6:32 | --- ##### GCP Project Introduction The GCP project is designed for those who want to learn how to build, manage, and optimize data pipelines on Google Cloud Platform. This project focuses on building an end-to-end pipeline that extracts data from an external weather API, processes it through GCP's data tools, and visualizes the results using Looker Studio. This project offers practical, hands-on experience with tools like Cloud SQL, Compute Engine, Cloud Functions, Pub/Sub, and Looker Studio. Check out this project in my Academy: [Learn More](https://learndataengineering.com/p/data-engineering-on-gcp) ##### Detailed GCP Project Curriculum | Module | Lesson | Duration | |--------|--------|----------| | **Introduction** | Introduction | 1:13 | | | GitHub & the Team | 1:30 | | **Data & Goals** | Architecture of the Project | 3:19 | | | Introduction to Weather API | 2:18 | | | Setup Google Cloud Account | 2:12 | | **Project Setup** | Creating the Project | 2:35 | | | Enabling the Required APIs | 1:34 | | | Configure Scheduling | 2:20 | | **Pipeline Creation - Extract from API** | Setup VM for Database Interaction | 2:53 | | | Setup MySQL Database | 2:16 | | | Setup VM Client and Create Database | 2:46 | | | Creating Pub/Sub Message Queue | 1:41 | | | Create Cloud Function to Pull Data from API | 4:17 | | | Explanation of Code to Pull from API | 4:20 | | **Pipeline Creation - Write to Database** | Create Function to Write to Database | 7:47 | | | Explanation of Code to Write Data to Database | 5:56 | | | Testing the Function | 5:51 | | | Create Function Write Data to DB - Pull | 3:53 | | | Explanation Code Write Data to DB - Pull | 4:33 | | **Visualization** | Setup Looker Studio 
and Create Bubble Chart | 2:20 | | | Setup Looker Studio and Create Time Series Chart | 1:57 | | | Pipeline Monitoring | 6:20 | --- #### Week 8: Working with Snowflake ##### Description Currently, Snowflake is the analytics store/data warehouse everybody is talking about. It is a 100% cloud-based platform that offers many advantages, including flexible data access and the ability to scale services as needed. Snowflake is widely used in the industry, and learning it will enhance your data engineering skill set. This training covers everything from the basics of Snowflake and data warehousing to advanced integration and automation techniques. By the end, you will have the knowledge to prepare, integrate, and manage data on Snowflake and to connect other systems to the platform. Check out this course in my Academy: [Learn More](https://learndataengineering.com/p/snowflake-for-data-engineers) ##### Course Curriculum | Module | Lesson | Duration | |--------|--------|----------| | | Snowflake Basics | 4:16 | | | Data Warehousing Basics | 4:13 | | | How Snowflake Fits into Data Platforms | 3:14 | | **Setup** | Snowflake Account Setup | 4:24 | | | Creating Your Warehouse & UI Overview | 4:15 | | **Loading CSVs from Your PC** | Our Dataset & Goals | 3:01 | | | Setup Snowflake Database | 10:29 | | | Preparing the Upload File | 8:31 | | | Using Internal Stages with SnowSQL | 12:37 | | | Splitting a Data Table into Two Tables | 6:38 | | **Visualizing Data** | Creating a Visualization Worksheet | 7:08 | | | Creating a Dashboard | 5:23 | | | Connect PowerBI to Snowflake | 6:03 | | | Query Data with Python | 7:35 | | **Automation** | Create Import Task | 9:18 | | | Create Table Refresh Task | 3:40 | | | Test Our Pipeline | 3:14 | | **AWS S3 Integration** | Working with External Stages for AWS S3 | 10:20 | | | Implementing Snowpipe with S3 | 6:19 | --- #### Week 9: Transforming Data With dbt ##### Description dbt is a SQL-first transformation workflow that simplifies the process of
transforming, testing, and documenting data. It allows teams to work directly within the data warehouse, creating trusted datasets for reporting, machine learning, and operational workflows. This training is the perfect starting point to get hands-on experience with dbt Core, dbt Cloud, and Snowflake. Check out this course in my Academy: [Learn More](https://learndataengineering.com/p/dbt-for-data-engineers) ##### Course Curriculum | Module | Lesson | Duration | |--------|--------|----------| | **dbt Introduction & Setup** | Modern Data Experience | 5:42 | | | Introduction to dbt | 4:38 | | | Goals of this Course | 4:50 | | | Snowflake Preparation | 7:29 | | | Loading Data into Snowflake | 4:48 | | | Setup dbt Core | 9:35 | | | Preparing the GitHub Repository | 3:32 | | **Working with dbt-Core** | dbt Models & Materialization Explained | 6:16 | | | Creating Your First SQL Model | 5:48 | | | Working with Custom Schemas | 5:28 | | | Creating Your First Python Model | 4:35 | | | dbt Sources | 1:55 | | | Configuring Sources | 4:03 | | | Working with Seed Files | 4:20 | | **Tests in dbt** | Generic Tests | 3:19 | | | Tests with Great Expectations | 3:25 | | | Writing Custom Generic Tests | 2:49 | | **Working with dbt-Cloud** | dbt Cloud Setup | 7:25 | | | Creating dbt Jobs | 5:14 | | | CI/CD Automation with dbt Cloud and GitHub | 10:52 | | | Documentation in dbt | 7:38 | --- #### Week 10: Pipeline Orchestration with Airflow ##### Description Apache Airflow is a powerful, platform-independent workflow orchestration tool widely used in the data engineering world. It allows you to create and monitor both stream and batch pipeline processes with ease. Airflow supports integration with major platforms and tools such as AWS, Google Cloud, and many more. Airflow not only helps in planning and organizing workflows but also offers robust monitoring features, allowing you to troubleshoot and maintain complex ETL pipelines effectively. 
As one of the most popular tools for workflow orchestration, mastering Airflow is highly valuable for data engineers. Check out this course in my Academy: [Learn More](https://learndataengineering.com/p/learn-apache-airflow) ##### Course Curriculum | Module | Lesson | Duration | |--------|--------|----------| | **Airflow Workflow Orchestration** | Airflow Usage | 3:19 | | **Airflow Fundamental Concepts** | Fundamental Concepts | 2:47 | | | Airflow Architecture | 3:09 | | | Example Pipelines | 4:49 | | | Spotlight 3rd Party Operators | 2:17 | | | Airflow XComs | 4:32 | | **Hands-On Setup** | Project Setup | 1:43 | | | Docker Setup Explained | 2:06 | | | Docker Compose & Starting Containers | 4:23 | | | Checking Services | 1:48 | | | Setup WeatherAPI | 1:33 | | | Setup Postgres DB | 1:58 | | **Learn Creating DAGs** | Airflow Webinterface | 4:37 | | | Creating DAG With Airflow 2.0 | 9:46 | | | Running our DAG | 4:15 | | | Creating DAG With TaskflowAPI | 6:59 | | | Getting Data From the API With SimpleHTTPOperator | 3:38 | | | Writing into Postgres | 4:12 | | | Parallel Processing | 4:15 | | **Recap** | Recap & Outlook | 4:38 | --- #### What’s Next? After completing this roadmap, you’ll have the confidence and skills to not just analyze data but to engineer and optimize it like a pro! Explore advanced topics, start contributing to projects, and showcase your new skills to potential employers. ### Roadmap for Data Scientists #### 14-Week Data Engineering Roadmap for Data Scientists #### From Notebooks to Production: Build, Deploy, and Scale Your ML Workflows #### Start this roadmap at my Academy: [Start Today](https://learndataengineering.com/p/data-engineering-for-data-scientists) --- #### Who Is This Roadmap For? 
- Data Scientists who want to deploy and maintain ML models in production - ML practitioners struggling with real-time data, CI/CD, and orchestration - Data professionals looking to expand their engineering toolkit - Anyone ready to go beyond notebooks and automate their ML workflows --- #### What You’ll Achieve This roadmap provides a step-by-step approach to gaining production-grade data engineering skills. You'll start with pipelines and containerization, move on to deployment and orchestration, and finish with big data and monitoring. ![Building blocks of your curriculum](/images/Roadmap-Data-Engineering-For-Data-Scientists.jpg) #### Learning Goals | Goal # | Description | | ------- | -------------------------------------------------- | | Goal #1 | Build an End-to-End ML Pipeline on AWS | | Goal #2 | Add CI/CD & Containerization to Your Platform | | Goal #3 | Implement the Lakehouse Architecture in AWS or GCP | | Goal #4 | Orchestrate Your Pipelines with Airflow | | Goal #5 | Process Big Data with Apache Spark & Streaming | | Goal #6 | Analyze Your ML Training Logs with Elasticsearch | --- #### 14-Week Learning Roadmap | Week | Topic | | ---------- | -------------------------------------------- | | Week 1 | Platform & Pipeline Design | | Week 2 | Docker Fundamentals | | Week 3 | Relational Data Modeling | | Week 4 | Working & Designing APIs | | Week 5 & 6 | ML & Containerization on AWS | | Week 7 | ETL & CI/CD on AWS | | Week 8 | Building a Lakehouse on AWS or GCP | | Week 9 | Orchestrate with Airflow | | Week 10 | Pre-Process Data with Apache Spark | | Week 11-13 | Build a Streaming Pipeline (AWS, Azure, GCP) | | Week 14 | Analyze Training Logs with Elasticsearch | --- #### Week 1: Platform & Pipeline Design ##### Description Data pipelines are the foundation of any data platform. In this 110-minute training, you'll learn about stream, batch, and ML pipelines. You'll also explore platform blueprints, architecture components, and Lambda architecture. 
**Check out this course in my Academy: [Learn More](https://learndataengineering.com/p/data-pipeline-design)** ##### Course Curriculum | Lesson | Duration | | ------------------------------------------------ | ----------- | | Platform Blueprint & End to End Pipeline Example | 10:11 | | Data Engineering Tools Guide | 2:44 | | End to End Pipeline Example | 6:18 | | Push Ingestion Pipelines | 3:42 | | Pull Ingestion Pipelines | 3:34 | | Batch Pipelines | 3:07 | | Streaming Pipelines | 3:34 | | Stream Analytics | 2:26 | | Lambda Architecture | 4:02 | | Visualization Pipelines | 3:47 | | Visualization with Hive & Spark on Hadoop | 6:21 | | Visualization Data via Spark Thrift Server | 3:27 | | Platform Examples (AWS, Azure, GCP, Hadoop) | Slides Only | --- #### Week 2: Docker Fundamentals ##### Description Docker is the go-to container platform for engineers. This training covers key concepts, hands-on Docker usage, building and running containers, and how Docker fits into production workflows. **Check out this course in my Academy: [Learn More](https://learndataengineering.com/p/docker-fundamentals)** ##### Course Curriculum | Lesson | Duration | | ----------------------------------- | -------- | | Docker vs Virtual Machines | 6:23 | | Docker Terminology | 5:56 | | Installing Docker Desktop | 4:09 | | Pulling Images & Running Containers | 6:34 | | CLI Cheat Sheet | 3:38 | | Docker Compose Explained | 6:34 | | Build & Run Hello World Image | 6:28 | | Build Image with Dependencies | 5:05 | | Using DockerHub | 4:24 | | Image Layers | 7:55 | | Deployment in Production | 5:47 | | Security Best Practices | 4:09 | | Managing Docker with Portainer | 4:04 | --- #### Week 3: Relational Data Modeling ##### Description Learn how to design efficient and scalable relational models. You'll go through conceptual to physical modeling and normalize your schema. You'll use MySQL and MySQL Workbench for hands-on practice. 
**Check out this course in my Academy: [Learn More](https://learndataengineering.com/p/relational-data-modeling)** ##### Course Curriculum | Lesson | Duration | | -------------------------------- | -------- | | History of Relational Models | 3:16 | | Installing MySQL & Workbench | 8:04 | | Workbench Introduction | 4:36 | | The Design Process Explained | 4:14 | | Discover Entities | 10:24 | | Discover Attributes | 13:09 | | Normalize & Define Relationships | 11:19 | | Identifying vs Non-identifying | 2:01 | | Resolve Many-to-Many | 4:00 | | Resolve One-to-Many | 2:34 | | Resolve One-to-One | 1:45 | | Create ER Diagram | 19:46 | | Create Physical Data Model | 4:13 | | Populate from XLS | 15:13 | | Course Conclusion | 1:28 | --- #### Week 4: Working & Designing APIs ##### Description APIs are the backbone of modern data platforms. You'll learn how to build and test APIs using FastAPI, design schemas, and deploy them in Docker. Postman and Docker are used for testing and deployment. **Check out this course in my Academy: [Learn More](https://learndataengineering.com/p/apis-with-fastapi-course)** ##### Course Curriculum | Lesson | Duration | | ----------------------------- | -------- | | What are APIs? | 8:29 | | Hosting vs Using APIs | 4:08 | | HTTP Methods & Media Types | 6:56 | | Response Codes & Parameters | 9:40 | | FastAPI Setup | 4:55 | | POST, GET, PUT API Methods | 16:18 | | Testing with Postman | 4:22 | | Deploying FastAPI with Docker | 6:01 | | API Security Best Practices | 3:48 | --- #### Week 5 & 6: ML & Containerization on AWS ##### Description This hands-on project teaches you how to build a real-time ML pipeline on AWS. You'll pull data from the Twitter API (or The Guardian API), apply sentiment analysis with NLTK in a Lambda function, store results in a Postgres database via RDS, and build a Streamlit dashboard. Finally, you’ll containerize and deploy the dashboard using AWS ECS and ECR. 
**Check out this project in my Academy: [Learn More](https://learndataengineering.com/p/ml-on-aws)** ##### Course Curriculum | Lesson | Duration | | -------------------------------------------------- | -------- | | Introduction | 2:38 | | Project Architecture Explained | 2:06 | | RDS Setup | 2:37 | | VPC Inbound Rules | 2:12 | | PG Admin Installation & S3 Config | 4:05 | | Lambda Intro & IAM Setup | 3:11 | | Create Lambda Function | 1:24 | | Lambda Code Explained | 8:22 | | Insert Code Into Lambda | 0:56 | | Add Layers from Klayers | 5:32 | | Create Custom Layers | 4:40 | | Test Lambda & Set Env Variables | 4:53 | | Schedule Lambda with EventBridge | 3:15 | | Setup Virtual Conda Environment | 4:07 | | Install Dependencies with Poetry | 5:57 | | Streamlit App Code Walkthrough | 7:52 | | Setup ECR Container Registry | 1:52 | | AWS CLI Install & Login | 5:19 | | Dockerfile Build & Push | 2:52 | | Create ECS Fargate Cluster | 1:34 | | ECS Task Configuration & Deployment | 4:59 | | Fixing ECS Task | 5:14 | | Stop ECS Task | 0:59 | | Project Conclusion | 5:06 | --- #### Week 7: ETL & CI/CD on AWS ##### Description In this project, you'll build a lightweight ETL job that pulls data from a public weather API and writes it into a time series database. You’ll dockerize the job, schedule it using AWS Lambda and EventBridge, and visualize the data using Grafana. 
**Check out this project in my Academy: [Learn More](https://learndataengineering.com/p/timeseries-etl-with-aws-tdengine-grafana)** ##### Course Curriculum | Lesson | Duration | | -------------------------------------------- | -------- | | Quick Note from Andreas | 0:43 | | Project Introduction | 1:26 | | Setup of the Project | 2:52 | | Time Series Data Basics | 2:20 | | Big Pros of Time Series Databases | 2:06 | | About TDengine | 1:22 | | Setup Weather API | 1:04 | | Code Query API | 2:41 | | TDengine Setup | 3:04 | | Connect Python to TDengine | 1:50 | | Lambda Docker Container & Push to ECR | 1:55 | | AWS Setup | 1:36 | | Create Lambda Function Using Docker Image | 1:04 | | Schedule Function with EventBridge | 1:25 | | CloudWatch Lambda Events | 0:27 | | Grafana Setup | 3:01 | --- #### Week 8: Building a Lakehouse on AWS or GCP ##### Description This week, you’ll learn how to combine data lakes and warehouses into a Lakehouse architecture. You’ll implement a full data analytics stack using tools like S3, Athena, BigQuery, Glue, Quicksight, and Data Studio.
**Check out this course in my Academy: [Learn More](https://learndataengineering.com/p/modern-data-warehouses)** ##### Course Curriculum | Lesson | Duration | | -------------------------------------------------------- | -------- | | Introduction | 2:13 | | Data Science Platform Overview | 4:10 | | ETL & ELT in Warehouses | 6:22 | | Data Lake & Warehouse Integration | 3:29 | | GCP Pipelines Overview | 3:13 | | Cloud Storage & BigQuery Hands-on | 8:35 | | Create Dashboard in Data Studio | 7:33 | | GCP Recap & AWS Goals | 2:12 | | Upload Data to S3 | 2:12 | | Athena Manual Table Configuration | 3:48 | | Create Dashboard in Quicksight | 5:05 | | Athena via Glue Catalog | 3:29 | | Course Recap | 2:36 | | BONUS: Redshift Spectrum with S3 | 2:57 | --- #### Week 9: Orchestrate with Airflow ##### Description This training will guide you through installing and running Apache Airflow in Docker, creating DAGs, using the Taskflow API, and monitoring workflow execution. **Check out this course in my Academy: [Learn More](https://learndataengineering.com/p/learn-apache-airflow)** ##### Course Curriculum | Lesson | Duration | | --------------------------------------------- | -------- | | Introduction | 1:36 | | Airflow Usage | 3:19 | | Fundamental Concepts | 2:47 | | Airflow Architecture | 3:09 | | Example Pipelines | 4:49 | | Spotlight on 3rd Party Operators | 2:17 | | Airflow XComs | 4:32 | | Project Setup | 1:43 | | Docker Setup Explained | 2:06 | | Docker Compose & Starting Containers | 4:23 | | Checking Services | 1:48 | | Weather API Setup | 1:33 | | Postgres DB Setup | 1:58 | | Airflow Web Interface | 4:37 | | Create DAG with Airflow 2.0 | 9:46 | | Run Your DAG | 4:15 | | Create DAG with Taskflow API | 6:59 | | Get Data via SimpleHTTP Operator | 3:38 | | Write to Postgres | 4:12 | | Parallel Processing | 4:15 | | Recap & Outlook | 4:38 | --- #### Week 10: Pre-Process Data with Apache Spark ##### Description This training introduces Apache Spark fundamentals, showing you how 
to process large datasets using Spark DataFrames, RDDs, and SparkSQL inside Docker and Jupyter Notebooks. **Check out this course in my Academy: [Learn More](https://learndataengineering.com/p/learning-apache-spark-fundamentals)** ##### Course Curriculum | Lesson | Duration | | ------------------------------------- | -------- | | Introduction & Contents | 3:30 | | Vertical vs Horizontal Scaling | 3:55 | | What Spark Is Good For | 4:45 | | Driver, Context & Executors | 4:11 | | Cluster Types | 1:59 | | Client vs Cluster Deployment | 6:11 | | Where to Run Spark | 3:38 | | Tools in Spark Course | 2:35 | | Dataset Overview | 4:11 | | Docker Setup | 2:52 | | Jupyter Notebook Setup & Run | 5:31 | | RDDs | 3:57 | | DataFrames | 1:40 | | Transformations & Actions Overview | 2:59 | | Transformations | 2:22 | | Actions | 3:06 | | JSON Transformations | 9:52 | | Working with Schemas | 8:23 | | Working with DataFrames | 10:09 | | SparkSQL | 5:04 | | Working with RDDs | 12:52 | --- #### Week 11–13: Build a Streaming Pipeline on AWS, Azure, or GCP ##### Description In this 3-week section, you'll complete an end-to-end streaming data project on the cloud platform of your choice: AWS, Azure, or GCP. Each project teaches you how to ingest real-time data, process it, store it, and create visualizations. You only need to complete one of the following three options: --- ##### Option 1: Streaming Pipeline on AWS ##### Description You'll use AWS services like API Gateway, Kinesis, DynamoDB, Redshift, Lambda, Glue, and Power BI to create a complete streaming solution. You'll work with e-commerce data and build multiple ingestion and batch pipelines. 
**Check out this project in my Academy: [Learn More](https://learndataengineering.com/p/data-engineering-on-aws)** ##### Course Curriculum | Lesson | Duration | | -------------------------------------------- | -------- | | Data Engineering | 4:15 | | Data Science Platform | 5:20 | | Dataset Introduction | 3:16 | | Relational Storage Possibilities | 3:46 | | NoSQL Storage Possibilities | 6:28 | | Platform Design & Pipeline Planning | 3:49 | | Client to Visualization Design | 3:00 | | Data Ingestion to Kinesis | 3:00 | | Stream to S3 and DynamoDB | 5:28 | | Visualization API & Redshift | 5:29 | | AWS Setup & IAM | 4:06 | | Create Lambda Functions | 2:33 | | Configure Firehose & Debugging | 7:43 | | Power BI Setup | 12:16 | | Glue Crawlers and Jobs | 26:52 | --- ##### Option 2: Streaming Pipeline on Azure ##### Description You’ll build a Twitter-like JSON stream pipeline using Azure Functions, Event Hub, Cosmos DB, and Power BI. You’ll learn how to set up API management, key vaults, and authentication. **Check out this project in my Academy: [Learn More](https://learndataengineering.com/p/build-streaming-data-pipelines-in-azure)** ##### Course Curriculum | Lesson | Duration | | ---------------------------------------------------- | -------- | | Project Introduction | 2:43 | | Local Preprocessing & Docker Setup | 7:06 | | Develop & Deploy Azure Functions | 5:52 | | Test Functions & Integrate with Blob Storage | 6:26 | | Add Functions to Azure API Management (APIM) | 7:05 | | Key Vault & Authentication | 4:41 | | Create Event Hubs and Bindings | 6:59 | | Write to Cosmos DB | 9:03 | | Power BI Connection and Dashboard Creation | 6:32 | --- ##### Option 3: Streaming Pipeline on GCP ##### Description This project shows how to extract weather data via API, stream it with Pub/Sub, write it into Cloud SQL, and visualize it with Looker Studio. You'll also learn function deployment and VM/database setup.
**Check out this project in my Academy: [Learn More](https://learndataengineering.com/p/data-engineering-on-gcp)** ##### Course Curriculum | Lesson | Duration | | --------------------------------------------------- | -------- | | Introduction & Setup | 2:43 | | Architecture & Weather API | 5:31 | | Enable APIs & Configure Scheduling | 4:00 | | Setup MySQL Database & Compute Engine | 4:40 | | Create Cloud Functions for Data Ingestion | 8:37 | | Use Pub/Sub for Messaging | 1:41 | | Write Data to Cloud SQL | 13:43 | | Test and Monitor Data Flow | 5:51 | | Setup Looker Studio & Build Dashboards | 4:17 | | Monitor Pipelines | 6:20 | --- #### Week 14: Analyze Training Logs with Elasticsearch ##### Description Wrap up your roadmap by learning how to monitor pipelines using Elasticsearch. You’ll deploy Elasticsearch with Docker, send logs from your training pipelines, and visualize them in Kibana dashboards. **Check out this course in my Academy: [Learn More](https://learndataengineering.com/p/log-analysis-with-elasticsearch)** ##### Course Curriculum | Lesson | Duration | | ------------------------------------------------ | -------- | | Course Introduction | 2:07 | | Elasticsearch vs Relational Databases | 5:43 | | ETL Log Analysis & Debugging | 3:54 | | Streaming Log Analysis & Debugging | 2:48 | | Solving Problems with Elasticsearch | 4:37 | | ELK Stack Overview | 2:03 | | Setup Limiting RAM & Environment Config | 4:26 | | Running Elasticsearch | 4:07 | | Elasticsearch APIs & Python Index Creation | 7:31 | | Write Logs (JSON) to Elasticsearch | 4:46 | | Create Kibana Visualizations & Dashboards | 9:27 | | Search Logs in Elasticsearch | 4:57 | | Course Recap | — | --- #### What’s Next? After 14 weeks, you’ll have built scalable, production-ready data pipelines and ML workflows. You can now explore more advanced projects, optimize performance, and contribute to production systems with confidence. Need help showcasing your skills or getting hired?
Reach out to my coaching program! ### Roadmap for Software Engineers ![Building blocks of your curriculum](/images/Data-Engineering-Roadmap-for-Software-Engineers.jpg) If you're transitioning from a background in computer science or software engineering into data engineering, you're already equipped with a solid foundation. Your existing knowledge in coding, familiarity with SQL databases, understanding of computer networking, and experience with operating systems like Linux provide you with a considerable advantage. These skills form the cornerstone of data engineering and can significantly streamline your learning curve as you embark on this new journey. Here's a refined roadmap, incorporating your prior expertise, to help you excel in data engineering: - **Deepen Your Python Skills:** Python is crucial in data engineering for processing and handling various data formats, such as APIs, CSV, and JSON. Given your coding background, focusing on Python for data engineering will enhance your ability to manipulate and process data effectively. - **Master Docker:** Docker is essential for deploying code and managing containers, streamlining the software distribution process. Your understanding of operating systems and networking will make mastering Docker more intuitive, as you'll appreciate the importance of containerization in today's development and deployment workflows. - **Platform and Pipeline Design:** Leverage your knowledge of computer networking and operating systems to grasp the architecture of data platforms. Understanding how to design data pipelines, including considerations for stream and batch processing, and emphasizing security, will be key. Your background will provide a solid foundation for understanding how different components integrate within a data platform. - **Choosing the Right Data Stores:** Dive into the specifics of data stores, understanding the nuances between transactional and analytical databases, and when to use relational vs.
NoSQL vs. document stores vs. time-series databases. Your experience with SQL databases will serve as a valuable baseline for exploring these various data storage options. - **Explore Cloud Platforms:** Get hands-on with cloud services such as AWS, GCP, and Azure. Projects or courses that offer practical experience with these platforms will be invaluable. Your tasks might include building pipelines to process data from APIs, using message queues, or delving into data warehousing and lakes, capitalizing on your foundational skills. - **Optional Deep Dives:** For those interested in advanced data processing, exploring technologies like Spark or Kafka for stream processing can be enriching. Additionally, learning how to build APIs and work with MongoDB for document storage can open new avenues, especially through practical projects. - **Log Analysis and Data Observability:** Familiarize yourself with tools like Elasticsearch, Grafana, and InfluxDB to monitor and analyze your data pipelines effectively. This area leverages your comprehensive understanding of how systems communicate and operate, enhancing your ability to maintain and optimize data flows. As you embark on this path, remember that your journey is unique. Your existing knowledge not only serves as a strong foundation but also as a catalyst for accelerating your growth in the realm of data engineering. Keep leveraging your strengths, explore areas of interest deeply, and continually adapt to the evolving landscape of data technology. | Live Stream -> Data Engineering Roadmap for Computer Scientists / Developers |------------------| |In this live stream you'll find even more details how to read this roadmap for Computer Scientists / Developers, why I chose these tools and why I think this is the right way to do it.
| [Watch on YouTube](https://youtube.com/live/0e4WfIUixRw)| ## Data Engineers Skills Matrix ![Data Engineer Skills Matrix](/images/Data-Engineer-Skills-Matrix.jpg) If you're diving into the world of data engineering or looking to climb the ladder within this field, you're in for a treat with this enlightening YouTube video. Andreas kicks things off by introducing us to a very handy tool they've developed: the Data Engineering Skills Matrix. This isn't just any chart; it's a roadmap designed to navigate the complex landscape of data engineering roles, ranging from a Junior Data Engineer to the lofty heights of a Data Architect and Machine Learning Engineer. | Live Stream -> Data Engineering Skills Matrix |------------------| |In this live stream you'll find even more details how to read this skills matrix for Data Engineers. | [Watch on YouTube](https://youtube.com/live/5E0UiBy0Kwo)| Andreas takes us through the intricacies of this matrix, layer by layer. Starting with the basics, they discuss the minimum experience needed for each role. It's an eye-opener, especially when you see how experience requirements evolve from a beginner to senior levels. But it's not just about how many years you've spent in the field; it's about the skills you've honed during that time. ### Challenges & Responsibilities As the conversation progresses, Andreas delves into the core responsibilities and main tasks associated with each role. You'll learn what sets a Junior Data Engineer apart from a Senior Data Engineer, the unique challenges a Data Architect faces, and the critical skills a Machine Learning Engineer must possess. This part of the video is golden for anyone trying to understand where they fit in the data engineering ecosystem or plotting their next career move. ### SQL & Soft Skills Then there's the talk on SQL knowledge and its relevance across different roles. This segment sheds light on how foundational SQL is, irrespective of your position. 
But it's not just about technical skills; the video also emphasizes soft skills, like leadership and collaboration, painting a holistic picture of what it takes to succeed in data engineering. For those who love getting into the weeds, Andreas doesn't disappoint. They discuss software development skills, debugging, and even dive into how data engineers work with SQL and databases. This part is particularly insightful for understanding the technical depth required at various stages of your career. ### Q&A And here's the cherry on top: Andreas encourages interaction, inviting viewers to share their experiences and questions. This makes the video not just a one-way learning experience but a dynamic conversation that enriches everyone involved. ### Summary By the end of this video, you'll walk away with a clear understanding of the data engineering field's diverse roles. You'll know the skills needed to excel in each role and have a roadmap for your career progression. Whether you're a recent graduate looking to break into data engineering or a seasoned professional aiming for a senior position, Andreas's video is a must-watch. It's not just a lecture; it's a guide to navigating the exciting world of data engineering, tailored by someone who's taken the time to lay out the journey for you. ## How to Become a Senior Data Engineer Becoming a senior data engineer is a goal many in the tech industry aspire to. It's a role that demands a deep understanding of data architecture, advanced programming skills, and the ability to lead and communicate effectively within an organization. In this live stream series, I dove into what it takes to climb the ladder to a senior data engineering position. Here are the key takeaways. You can find the links to the videos and the shown images below. ### Understanding the Role The journey to becoming a senior data engineer starts with a clear understanding of what the role entails. 
Senior data engineers are responsible for designing, implementing, and maintaining an organization's data architecture. They ensure data accuracy, accessibility, and security, often taking the lead on complex projects that require advanced technical skills and strategic thinking. ### Key Skills and Knowledge Areas Based on insights from the live stream and consultations with industry experts, including GPT-3, here are the critical areas where aspiring senior data engineers should focus their development: - **Advanced Data Modeling and Architecture:** Mastery of data modeling techniques and architecture best practices is crucial. This includes understanding of dimensional and Data Vault modeling, as well as expertise in SQL and NoSQL databases. - **Big Data Technologies:** Familiarity with distributed computing frameworks (like Apache Spark), streaming technologies (such as Apache Kafka), and cloud-based big data technologies is essential. - **Advanced ETL Techniques:** Skills in incremental loading, data merging, and transformation are vital for efficiently processing large datasets. - **Data Warehousing and Data Lake Implementation:** Building and maintaining scalable and performant data warehouses and lakes are fundamental responsibilities. - **Cloud Computing:** Proficiency in cloud services from AWS, Azure, or GCP, along with platforms like Snowflake and Databricks, is increasingly important. - **Programming and Scripting:** Advanced coding skills in languages relevant to data engineering, such as Python, Scala, or Java, are non-negotiable. - **Data Governance and Compliance:** Understanding data governance frameworks and compliance requirements is critical, especially in highly regulated industries. - **Leadership and Communication:** Beyond technical skills, the ability to lead projects, communicate effectively with both technical and non-technical team members, and mentor junior engineers is what differentiates a senior engineer.
### Learning Pathways Becoming a senior data engineer requires continuous learning and real-world experience. Here are a few steps to guide your journey: - **Educational Foundation:** Start with a strong foundation in computer science or a related field. This can be through formal education or self-study courses. - **Gain Practical Experience:** Apply your skills in real-world projects. This could be in a professional setting, contributions to open-source projects, or personal projects. - **Specialize and Certify:** Consider specializing in areas particularly relevant to your interests or industry needs. Obtaining certifications in specific technologies or platforms can also bolster your credentials. - **Develop Soft Skills:** Work on your communication, project management, and leadership skills. These are as critical as your technical abilities. - **Seek Feedback and Mentorship:** Learn from the experiences of others. Seek out mentors who can provide guidance and feedback on your progress. 
### Video 1 | Live Stream -> How to become a Senior Data Engineer - Part 1 |------------------| | In this part one I talked about Data Modeling, Big Data, ETL, Data Warehousing & Data Lakes as well as Cloud computing | [Watch on YouTube](https://youtube.com/live/M-6xkTCKQQc)| ![Watch on YouTube](/images/Becoming-a-Senior-Data-Engineer-Video-1.jpg) ### Video 2 | Live Stream -> How to become a Senior Data Engineer - Part 2 |------------------| | In part two I talked about real time data processing, programming & scripting, data governance, compliance and data security | [Watch on YouTube](https://youtube.com/live/po96pzpjxvA)| ![Watch on YouTube](/images/Becoming-a-Senior-Data-Engineer-Video-2.jpg) ### Video 3 | Live Stream -> How to become a Senior Data Engineer - Part 3 |------------------| | In part 3 I focused on everything regarding Leadership and Communication: team management, project management, collaboration, problem solving, strategic thinking, communication and leadership | [Watch on YouTube](https://youtube.com/live/DMumpzSyRjI)| ![Watch on YouTube](/images/Becoming-a-Senior-Data-Engineer-Video-3.jpg) ### Final Thoughts The path to becoming a senior data engineer is both challenging and rewarding. It requires a blend of technical prowess, continuous learning, and the development of soft skills that enable you to lead and innovate. Whether you're just starting out or looking to advance your career, focusing on the key areas outlined above will set you on the right path. 
================================================ FILE: sections/02-BasicSkills.md ================================================ Basic Computer Science Skills ============================= ## Contents - [Learn to Code](02-BasicSkills.md#learn-to-code) - [Get Familiar with Git](02-BasicSkills.md#get-familiar-with-git) - [Agile Development](02-BasicSkills.md#agile-development) - [Why Is Agile So Important?](02-BasicSkills.md#Why-is-agile-so-important) - [Agile Rules I Learned Over the Years](02-BasicSkills.md#agile-rules-i-learned-over-the-years) - [Agile Frameworks](02-BasicSkills.md#agile-frameworks) - [Scrum](02-BasicSkills.md#scrum) - [OKR](02-BasicSkills.md#okr) - [Software Engineering Culture](02-BasicSkills.md#software-engineering-culture) - [Learn How a Computer Works](02-BasicSkills.md#learn-how-a-computer-works) - [Data Network Transmission](02-BasicSkills.md#data-network-transmission) - [Security and Privacy](02-BasicSkills.md#security-and-privacy) - [SSL Public and Private Key Certificates](02-BasicSkills.md#ssl-public-and-private-key-Certificates) - [JSON Web Tokens](02-BasicSkills.md#json-web-tokens) - [GDPR Regulations](02-BasicSkills.md#gdpr-regulations) - [Linux](02-BasicSkills.md#linux) - [OS Basics](02-BasicSkills.md#os-basics) - [Shell Scripting](02-BasicSkills.md#shell-scripting) - [Cron Jobs](02-BasicSkills.md#cron-jobs) - [Packet Management](02-BasicSkills.md#packet-management) - [Docker](02-BasicSkills.md#docker) - [What is Docker and How it Works](02-BasicSkills.md#what-is-docker-and-what-do-you-use-it-for) - [Kubernetes Container Deployment](02-BasicSkills.md#kubernetes-container-deployment) - [Why and How To Do Docker Container Orchestration](02-BasicSkills.md#why-and-how-to-do-docker-container-orchestration) - [Useful Docker Commands](02-BasicSkills.md#useful-docker-commands) - [The Cloud](02-BasicSkills.md#the-cloud) - [IaaS vs. PaaS vs. 
SaaS](02-BasicSkills.md#iaas-vs-paas-vs-saas) - [AWS Azure IBM Google](02-BasicSkills.md#aws-azure-ibm-google) - [Cloud vs. On-Premises](02-BasicSkills.md#cloud-vs-on-premises) - [Security](02-BasicSkills.md#security) - [Hybrid Clouds](02-BasicSkills.md#hybrid-clouds) - [Data Scientists and Machine Learning](02-BasicSkills.md#Data-Scientists-and-Machine-Learning) - [Machine Learning Workflow](02-BasicSkills.md#machine-learning-workflow) - [Machine Learning Model and Data](02-BasicSkills.md#machine-learning-model-and-data) Learn to Code ------------- Why this is important: Without coding you cannot do much in data engineering. I cannot count the number of times I needed a quick hack to solve a problem. The possibilities are endless: - Writing or quickly getting some data out of a SQL DB. - Testing to produce messages to a Kafka topic. - Understanding the source code of a Webservice. - Reading counter statistics out of a HBase key-value store. So, which language do I recommend then? If you had asked me a few years ago I would have said Java, 100%. Nowadays though the community has moved heavily to Python. I highly recommend starting with it. When you are getting into data processing with Spark you can use Scala which is a JVM language, but Python is also very good here. Python is a great choice. It is super versatile. Where to Learn Python? There are free Python courses all over the internet. - I have a beginner one in my Data Engineering academy: [Introduction to Python course](https://learndataengineering.com/p/introduction-to-python) - I also have a Python for Data Engineers one in my Data Engineering academy: [Python for Data Engineers course](https://learndataengineering.com/p/python-for-data-engineers) Keep in mind to always keep it practical: Learning by doing! I talked about the importance of learning by doing in this podcast: Get Familiar with Git --------------------- Why this is important: One of the major problems with coding is to keep track of changes.
It is also almost impossible to maintain a program you have multiple versions of. Another problem is the topic of collaboration and documentation, which is super important. Let's say you work on a Spark application and your colleagues need to make changes while you are on holiday. Without some code management, they are in huge trouble: Where is the code? What have you changed last? Where is the documentation? How do we mark what we have changed? But, if you put your code on GitHub, your colleagues can find your code. They can understand it through your documentation (please also have in-line comments). Developers can pull your code, make a new branch, and do the changes. After your holiday, you can inspect what they have done and merge it with your original code, and you end up having only one application. Where to learn: Check out the GitHub Guides page where you can learn all the basics: This great GitHub commands cheat sheet saved my butt multiple times: Also look into: - Pull - Push - Branching - Forking GitHub uses markdown to write pages, a super simple language that is actually a lot of fun to write. Here's a markdown cheatsheet: Pandoc is a great tool to convert any text file to and from markdown: Agile Development ----------------- Agility is the ability to adapt quickly to changing circumstances. These days, everyone wants to be agile. Big and small companies are looking for the "startup mentality." Many think it's the corporate culture. Others think it's the process of how we create things that matters. In this article, I am going to talk about agility and self-reliance, about how you can incorporate agility in your professional career. ### Why Is Agile So Important? Historically, development has been practiced as an explicitly defined process. You think of something, specify it, have it developed, and then build in mass production. It's a bit of an arrogant process.
You assume that you already know exactly what a customer wants, or how a product has to look and how everything works out. The problem is that the world does not work this way! Oftentimes the circumstances change because of internal factors. Sometimes things just do not work out as planned or stuff is harder than you think. You need to adapt. Other times you find out that you built something customers do not like and that needs to be changed. You need to adapt. That's why people jump on the Scrum train -- because Scrum is the definition of agile development, right? ### Agile Rules I Learned Over the Years #### Is the Method Making a Difference? Yes, Scrum or Google's OKR can help to be more agile. The secret to being agile, however, is not only how you create. What makes me cringe is people trying to tell you that being agile starts in your head. So, the problem is you? No! The biggest lesson I have learned over the past years is this: Agility goes down the drain when you outsource work. #### The Problem with Outsourcing I know on paper outsourcing seems like a no-brainer: development costs against the fixed costs. It is expensive to bind existing resources on a task. It is even more expensive if you need to hire new employees. The problem with outsourcing is that you pay someone to build stuff for you. It does not matter who you pay to do something for you. He needs to make money. His agenda will be to spend as little time as possible on your work. That is why outsourcing requires contracts, detailed specifications, timetables, and delivery dates. He doesn't want to spend additional time on a project, only because you want changes in the middle. Every unplanned change costs him time and therefore money. If so, you need to make another detailed specification and a contract change. He is not going to put his mind into improving the product while developing. Firstly, because he does not have the big picture. Secondly, because he does not want to.
He is doing as he is told. Who can blame him? If I were the subcontractor, I would do exactly the same! Does this sound agile to you? #### Knowledge Is King: A lesson from Elon Musk Doing everything in house -- that's why startups are so productive. No time is wasted on waiting for someone else. If something does not work or needs to be changed, there is someone on the team who can do it right away. One very prominent example who follows this strategy is Elon Musk. Tesla's Gigafactories are designed to get raw materials in on one side and spit out cars on the other. Why do you think Tesla is building Gigafactories that cost a lot of money? Why is SpaceX building its own space engines? Clearly, there are other, older companies who could do that for them. Why is Elon building tunnel boring machines at his new boring company? At first glance, this makes no sense! #### How You Really Can Be Agile If you look closer, it all comes down to control and knowledge. You, your team, your company, needs to do as much as possible on your own. Self-reliance is king. Build up your knowledge and therefore the team's knowledge. When you have the ability to do everything yourself, you are in full control. You can build electric cars, build rocket engines, or bore tunnels. Don't largely rely on others, and be confident to just do stuff on your own. Dream big, and JUST DO IT! PS. Don't get me wrong. You can still outsource work. Just do it in a smart way by outsourcing small independent parts. ### Agile Frameworks #### Scrum There's an interesting Medium article with a lot of details about Scrum: Also, this Scrum guide webpage has good info: #### OKR I personally love OKR and have been using it for years. Especially for smaller teams, OKR is great. You don't have a lot of overhead and get work done. It helps you stay focused and look at the bigger picture. I recommend doing a sync meeting every Monday. 
There you talk about what happened last week and what you are going to work on this week. I talked about this in this podcast: There is also this awesome 1,5-hour startup guide from Google: I really love this video; I rewatched it multiple times. ### Software Engineering Culture The software engineering and development culture is super important. How does a company handle product development with hundreds of developers? Check out this podcast: | Podcast episode: #070 Engineering Culture At Spotify |------------------ |In this podcast, we look at the engineering culture at Spotify, my favorite music streaming service. The process behind the development of Spotify is really awesome. |[Watch on YouTube](https://youtu.be/1asVrsUDbp0) \ [Listen on Anchor](https://anchor.fm/andreaskayy/episodes/070-The-Engineering-Culture-At-Spotify-e45ipa)| **Some interesting slides:** Learn How a Computer Works -------------------------- ### CPU,RAM,GPU,HDD ### Differences Between PCs and Servers I talked about computer hardware and GPU processing in this podcast: Data Network Transmission --------------------------------------- ### OSI Model The OSI Model describes how data flows through the network. It consists of layers starting from physical layers, basically how the data is transmitted over the line or optic fiber. Check out this article for a deeper understanding of the layers and processes outlined in the OSI model: The Wikipedia page is also very good: ###### Which Protocol Lives on Which Layer? Check out this network protocol map. 
Unfortunately, it is really hard to find it these days: ### IP Subnetting Check out this IP address and subnet guide from Cisco: A calculator for subnets: ### Switch, Layer-3 Switch For an introduction to how ethernet went from broadcasts, to bridges, to Ethernet MAC switching, to ethernet & IP (layer 3) switching, to software-defined networking, and to programmable data planes that can switch on any packet field and perform complex packet processing, see this video: ### Router ### Firewalls I talked about network infrastructure and techniques in this podcast: Security and Privacy -------------------- ### SSL Public and Private Key Certificates ### JSON Web Tokens Link to the Wiki page: ### GDPR Regulations The EU created the GDPR \"General Data Protection Regulation\" to protect your personal data like: name, age, address, and so on. It's huge and quite complicated. If you want to do online business in the EU, you need to apply these rules. The GDPR has been applicable since May 25th, 2018. So, if you haven't looked into it, now is the time. The penalties can be crazy high if you make mistakes here. Check out the full GDPR regulation here: By the way, if you do profiling or analyse big data in general, look into it. There are some important regulations, unfortunately. I spent months with GDPR compliance. Super fun. Not! Hahaha ### Privacy by Design When should you look into privacy regulations and solutions? Creating the product or service first and then bolting on the privacy is a bad choice. The best way is to start implementing privacy right away in the engineering phase. This is called privacy by design. Privacy is an integral part of your business, not just something optional. Check out the Wikipedia page to get a feeling for the important principles: Linux ----- Linux is very important to learn, at least the basics. Most big-data tools or NoSQL databases run on Linux.
From time to time, you need to modify stuff through the operating system, especially if you run an infrastructure as a service solution like Cloudera CDH, Hortonworks, or a MapR Hadoop distribution. ### OS Basics Show all historic commands: history | grep docker ### Shell scripting Ah, creating shell scripts in 2019? Believe it or not, scripting in the command line is still important. Start a process, automatically rename, move or do a quick compaction of log files. It still makes a lot of sense. Check out this cheat sheet to get started with scripting in Linux: There's also this Medium article with a super-simple example for beginners: ### Cron Jobs Cron jobs are super important to automate simple processes or jobs in Linux. You need this here and there, I promise. Check out these three guides: And, of course, Wikipedia, which is surprisingly good: Pro tip: Don't forget to end your cron files with an empty line or a comment, otherwise it will not work. ### Packet Management Linux tips are the second part of this podcast: Docker ------ ### What is Docker, and What Do You Use It for? Have you played around with Docker yet? If you're a data science learner or a data scientist, you need to check it out! It's awesome because it simplifies the way you can set up development environments for data science. If you want to set up a dev environment, you usually have to install a lot of packages and tools. #### Don't Mess Up Your System What this does is basically mess up your operating system. If you're just starting out, you don't know which packages you need to install. You don't know which tools you need to install. If you want to, for instance, start with Jupyter Notebooks, you need to install that on your PC somehow. Or, you need to start installing tools like PyCharm or Anaconda. All that gets added to your system, and so you mess up your system more and more and more. What Docker brings you, especially if you're on a Mac or a Linux system, is simplicity. 
#### Preconfigured Images Because it is so easy to install on those systems, another cool thing about Docker images is you can just search them in the Docker store, download them, and install them on your system. Running them in a completely pre-configured environment, you don't need to think about stuff. You go to the Docker library, and you search for Deep Learning, GPU and Python. You get a list of images you can download. You download one, start it up, go to the browser and hit up the URL, and just start coding. Start doing the work. The only other thing you need to do is bind some drives to that instance so you can exchange files. And, then that's it! There is no way that you can crash or mess up your system. It's all encapsulated into Docker. Why this works is because Docker has native access to your hardware. #### Take It With You It's not a completely virtualized environment like a VirtualBox. An image has the upside that you can take it wherever you want. So, if you're on your PC at home, use that there. Make a quick build, take the image, and go somewhere else. Install the image, which is usually quite fast, and just use it like you're at home. It's that awesome! ### Kubernetes Container Deployment I am getting into Docker a lot more myself. For a few different reasons. What I'm looking for is using Docker with Kubernetes. With Kubernetes, you can automate the whole container deployment process. The idea is that you have a cluster of machines. Let's say you have a 10-server cluster and you run Kubernetes on it. Kubernetes lets you spin up Docker containers on demand to execute tasks. You can set up how many resources like CPU, RAM, and network your Docker container can use. You can basically spin up containers, on the cluster on demand, whenever you need to do an analytics task. That's perfect for data science. ### How to Create, Start, Stop a Container ### Docker Micro-Services?
### Kubernetes ### Why and How to Do Docker Container Orchestration Podcast about how data science learners use Docker (for data scientists): ### Useful Docker Commands Create a container: docker run CONTAINER --network NETWORK Start a stopped container: docker start CONTAINER NAME Stop a running container: docker stop List all running containers: docker ps List all containers including stopped ones: docker ps -a Inspect the container configuration (e.g. network settings, etc.): docker inspect CONTAINER List all available virtual networks: docker network ls Create a new network: docker network create NETWORK --driver bridge Connect a running container to a network: docker network connect NETWORK CONTAINER Disconnect a running container from a network: docker network disconnect NETWORK CONTAINER Remove a network: docker network rm NETWORK The Cloud --------- ### IaaS vs. PaaS vs. SaaS Check out this podcast. It will help you understand the difference and how to decide what to use. | Podcast episode: #082 Reading Tweets With Apache Nifi & IaaS vs PaaS vs SaaS |------------------| |In this episode, we talk about the differences between infrastructure as a service, platform as a service, and application as a service. Then, we install the Nifi Docker container and look into how we can extract the twitter data. | [Watch on YouTube](https://youtu.be/pWuT4UAocUY) \ [Listen on Anchor](https://anchor.fm/andreaskayy/episodes/082-Reading-Tweets-With-Apache-Nifi--IaaS-vs-PaaS-vs-SaaS-e45j50)| ### AWS, Azure, IBM, Google Each of these have their own answer to IaaS, Paas, and SaaS. Pricing and pricing models vary greatly between each provider. Likewise, each provider's service may have limitations and strengths. #### AWS Here is the [full list of AWS services](https://www.amazonaws.cn/en/products/). 
Studying for the [AWS Certified Cloud Practitioner](https://aws.amazon.com/certification/certified-cloud-practitioner/?ch=cta&cta=header&p=2) and/or [AWS Certified Solutions Architect](https://aws.amazon.com/certification/certified-solutions-architect-associate/?ch=sec&sec=rmg&d=1) exams can be helpful to quickly gain an understanding of all these services. Here are links for free digital training for the [AWS Certified Cloud Practitioner](https://explore.skillbuilder.aws/learn/public/learning_plan/view/82/cloud-foundations-learning-plan) and [AWS Certified Solutions Architect Associate](https://explore.skillbuilder.aws/learn/public/learning_plan/view/78/architect-learning-plan). Here is a free 17-hour [Data Analytics Learning plan](https://explore.skillbuilder.aws/learn/public/learning_plan/view/97/data-analytics-learning-plan) for AWS's [Analytics](https://aws.amazon.com/big-data/datalakes-and-analytics/?nc2=h_ql_prod_an)/Data Engineering services. #### Azure [Full list of Azure services](https://azure.microsoft.com/en-us/services/). [Get started with mini courses](https://docs.microsoft.com/en-us/learn/browse/). #### IBM #### Google Google Cloud Platform offers a wide, ever-evolving variety of services. [List of GCP services with brief description](https://github.com/gregsramblings/google-cloud-4-words). In recent years, documentation and tutorials have come a long way to help [getting started with GCP](https://cloud.google.com/gcp/getting-started/). You can start with a free account, but to use many of the services, you will need to turn on billing. Once you do enable billing, always remember to turn off services that you have spun up for learning purposes. It is also a good idea to turn on billing limits and alerts. ### Cloud vs. On-Premises | Podcast episode: #076 Cloud vs. On-Premise |------------------| |How to choose between cloud and on-premises, pros and cons and what you have to think about. There are good reasons to not go cloud.
Also, thoughts on how to choose between the cloud providers by just comparing instance prices. Otherwise, the comparison will drive you insane. My suggestion: Basically use them as IaaS and something like Cloudera as PaaS. Then build your solution on top of that. | [Watch on YouTube](https://youtu.be/BAzj0yGcrnE) \ [Listen on Anchor](https://anchor.fm/andreaskayy/episodes/076-Cloud-vs-On-Premise-How-To-Decide-e45ivk)| ### Security Listen to a few thoughts about the cloud in this podcast: ### Hybrid Clouds Hybrid clouds are a mixture of on-premises and cloud deployment. A very interesting example for this is Google Anthos: # Data Scientists and Machine Learning Data scientists aren't like every other scientist. Data scientists do not wear white coats or work in high tech labs full of science fiction movie equipment. They work in offices just like you and me. What differs them from most of us is that they are math experts. They use linear algebra and multivariable calculus to create new insight from existing data. How exactly does this insight look? Here's an example: An industrial company produces a lot of products that need to be tested before shipping. Usually such tests take a lot of time because there are hundreds of things to be tested. All to make sure that your product is not broken. Wouldn't it be great to know early if a test fails ten steps down the line? If you knew that you could skip the other tests and just trash the product or repair it. That's exactly where a data scientist can help you, big-time. This field is called predictive analytics and the technique of choice is machine learning. Machine what? Learning? Yes, machine learning, it works like this: You feed an algorithm with measurement data. It generates a model and optimises it based on the data you fed it with. That model basically represents a pattern of how your data is looking. 
You show that model new data and the model will tell you if the data still represents the data you have trained it with. This technique can also be used for predicting machine failure in advance with machine learning. Of course the whole process is not that simple. The actual process of training and applying a model is not that hard. A lot of work for the data scientist is to figure out how to pre-process the data that gets fed to the algorithms. In order to train an algorithm you need useful data. If you use any data for the training the produced model will be very unreliable. An unreliable model for predicting machine failure would tell you that your machine is damaged even if it is not. Or even worse: It would tell you the machine is ok even when there is a malfunction. Model outputs are very abstract. You also need to post-process the model outputs to receive the outputs you desire ![The Machine Learning Pipeline](/images/Machine-Learning-Pipeline.jpg) ## Machine Learning Workflow ![The Machine Learning Workflow](/images/Machine-Learning-Workflow.jpg) Data Scientists and Data Engineers. How does that all fit together? You have to look at the data science process. How stuff is created and how data science is done. How machine learning is done. The machine learning process shows, that you start with a training phase. A phase where you are basically training the algorithms to create the right output. In the learning phase you are having the input parameters. Basically the configuration of the model and you have the input data. What you're doing is you are training the algorithm. While training the algorithm modifies the training parameters. It also modifies the used data and then you are getting to an output. Once you get an output you are evaluating. Is that output okay, or is that output not the desired output? if the output is not what you were looking for? Then you are continuing with the training phase. 
You're trying to retrain the model hundreds, thousands, hundred thousands of times. Of course all this is being done automatically. Once you are satisfied with the output, you are putting the model into production. In production it is no longer fed with training data; it's fed with the live data. It's evaluating the input data live and putting out live results. So, you went from training to production and then what? What you do is monitoring the output. If the output keeps making sense, all good! If the output of the model changes and it's no longer what you have expected, it means the model doesn't work anymore. You need to trigger a retraining of the model. It basically gets trained again. Once you are again satisfied with the output, you put it into production again. It replaces the one in production. This is the overall process of how machine learning works. It's how the learning part of data science is working. ## Machine Learning Model and Data ![The Machine Learning Model](/images/Machine-Learning-Model.jpg) Now that's all very nice. When you look at it, you have two very important places where you have data. You have in the training phase two types of data: Data that you use for the training. Data that basically configures the model, the hyper parameter configuration. Once you're in production you have the live data that is streaming in. Data that is coming in from an app, from an IoT device, logs, or whatever. A data catalog is also important. It explains which features are available and how different data sets are labeled. All different types of data. Now, here comes the engineering part. The data engineer's part is making this data available. Available to the data scientist and the machine learning process. So when you look at the model, on the left side you have your hyper parameter configuration. You need to store and manage these configurations somehow. Then you have the actual training data.
There's a lot going on with the training data: Where does it come from? Who owns it? Which is basically data governance. What's the lineage? Have you modified this data? What did you do, what was the basis, the raw data? You need to access all this data somehow. In training and in production. In production you need to have access to the live data. All this is the data engineers job. Making the data available. First an architect needs to build the platform. This can also be a good data engineer. Then the data engineer needs to build the pipelines. How is the data coming in and how is the platform connecting to other systems. How is that data then put into the storage. Is there a pre processing for the algorithms necessary? He'll do it. Once the data and the systems are available, it's time for the machine learning part. It is ready for processing. Basically ready for the data scientist. Once the analytics is done the data engineer needs to build pipelines to make it then accessible again. For instance for other analytics processes, for APIs, for front ends and so on. All in all, the data engineer's part is a computer science part. 
That's why I love it so much :) ================================================ FILE: sections/03-AdvancedSkills.md ================================================ Advanced Data Engineering Skills ================================ ## Contents - [Data Science Platform](03-AdvancedSkills.md#data-science-platform) - [Why a Good Data Platform Is Important](03-AdvancedSkills.md#why-a-good-data-platform-is-important) - [Big Data vs Data Science and Analytics](03-AdvancedSkills.md#Big-Data-vs-Data-Science-and-Analytics) - [The 4 Vs of Big Data](03-AdvancedSkills.md#the-4-vs-of-big-data) - [Why Big Data](03-AdvancedSkills.md#why-big-data) - [Planning is Everything](03-AdvancedSkills.md#planning-is-everything) - [The Problem with ETL](03-AdvancedSkills.md#the-problem-with-etl) - [Scaling Up](03-AdvancedSkills.md#scaling-up) - [Scaling Out](03-AdvancedSkills.md#scaling-out) - [When not to Do Big Data](03-AdvancedSkills.md#please-dont-go-big-data) - [81 Platform & Pipeline Design Questions](03-AdvancedSkills.md#81-platform-and-pipeline-design-questions) - [Data Source Questions](03-AdvancedSkills.md#data-source-questions) - [Goals and Destination Questions](03-AdvancedSkills.md#goals-and-destination-questions) - [Connect](03-AdvancedSkills.md#connect) - [REST APIs](03-AdvancedSkills.md#rest-apis) - [API Design](03-AdvancedSkills.md#api-design) - [Implementation Frameworks](03-AdvancedSkills.md#implementation-frameworks) - [Security](03-AdvancedSkills.md#security) - [Apache Nifi](03-AdvancedSkills.md#apache-nifi) - [Logstash](03-AdvancedSkills.md#logstash) - [Buffer](03-AdvancedSkills.md#buffer) - [Apache Kafka](03-AdvancedSkills.md#apache-kafka) - [Why a Message Queue Tool?](03-AdvancedSkills.md#why-a-message-queue-tool) - [Kafka Architecture](03-AdvancedSkills.md#kafka-architecture) - [Kafka Topics](03-AdvancedSkills.md#what-are-topics) - [Kafka and Zookeeper](03-AdvancedSkills.md#what-does-zookeeper-have-to-do-with-kafka) - [How to Produce and Consume 
Messages](03-AdvancedSkills.md#how-to-produce-and-consume-messages) - [Kafka Commands](03-AdvancedSkills.md#kafka-commands) - [Apache Redis Pub-Sub](03-AdvancedSkills.md#redis-pub-sub) - [AWS Kinesis](03-AdvancedSkills.md#apache-kafka) - [Google Cloud PubSub](03-AdvancedSkills.md#google-cloud-pubsub) - [Processing Frameworks](03-AdvancedSkills.md#processing-frameworks) - [Lambda and Kappa Architecture](03-AdvancedSkills.md#lambda-and-kappa-architecture) - [Batch Processing](03-AdvancedSkills.md#batch-processing) - [Stream Processing](03-AdvancedSkills.md#stream-processing) - [Three Methods of Streaming](03-AdvancedSkills.md#three-methods-of-streaming) - [At Least Once](03-AdvancedSkills.md#at-least-once) - [At Most Once](03-AdvancedSkills.md#at-most-once) - [Exactly Once](03-AdvancedSkills.md#exactly-once) - [Check The Tools](03-AdvancedSkills.md#check-the-tools) - [Should You do Stream or Batch Processing](03-AdvancedSkills.md#should-you-do-stream-or-batch-processing) - [Is ETL still relevant for Analytics?](03-AdvancedSkills.md#is-etl-still-relevant-for-analytics) - [MapReduce](03-AdvancedSkills.md#mapreduce) - [How Does MapReduce Work](03-AdvancedSkills.md#How-does-mapreduce-work) - [MapReduce](03-AdvancedSkills.md#mapreduce) - [MapReduce Example](03-AdvancedSkills.md#example) - [MapReduce Limitations](03-AdvancedSkills.md#What-is-the-limitation-of-mapreduce) - [Apache Spark](03-AdvancedSkills.md#apache-spark) - [What is the Difference to MapReduce?](03-AdvancedSkills.md#what-is-the-difference-to-MapReduce) - [How Spark Fits to Hadoop](03-AdvancedSkills.md#how-does-spark-fit-to-hadoop) - [Spark vs Hadoop](03-AdvancedSkills.md#wheres-the-difference) - [Spark and Hadoop a Perfect Fit](03-AdvancedSkills.md#spark-and-hadoop-is-a-perfect-fit) - [Spark on YARn](03-AdvancedSkills.md#spark-on-yarn) - [My Simple Rule of Thumb](03-AdvancedSkills.md#my-simple-rule-of-thumb) - [Available Languages](03-AdvancedSkills.md#available-languages) - [Spark Driver Executor and 
SparkContext](03-AdvancedSkills.md#how-spark-works-driver-executor-sparkcontext) - [Spark Batch vs Stream processing](03-AdvancedSkills.md#spark-batch-vs-stream-processing) - [How Spark uses Data From Hadoop](03-AdvancedSkills.md#How-does-spark-use-data-from-hadoop) - [What are RDDs and How to Use Them](03-AdvancedSkills.md#what-are-rdds-and-how-to-use-them) - [SparkSQL How and Why to Use It](03-AdvancedSkills.md#available-languages) - [What are Dataframes and How to Use Them](03-AdvancedSkills.md#what-are-dataframes-how-to-use-them) - [Machine Learning on Spark (TensorFlow)](03-AdvancedSkills.md#machine-learning-on-spark-tensor-flow) - [MLlib](03-AdvancedSkills.md#mllib) - [Spark Setup](03-AdvancedSkills.md#spark-setup) - [Spark Resource Management](03-AdvancedSkills.md#spark-resource-management) - [AWS Lambda](03-AdvancedSkills.md#apache-flink) - [Apache Flink](03-AdvancedSkills.md#apache-flink) - [Elasticsearch](03-AdvancedSkills.md#elasticsearch) - [Apache Drill](03-AdvancedSkills.md#apache-drill) - [StreamSets](03-AdvancedSkills.md#streamsets) - [Store](03-AdvancedSkills.md#store) - [Analytical Data Stores](03-AdvancedSkills.md#analytical-data-stores) - [Data Warehouse vs Data Lake](03-AdvancedSkills.md#data-warehouse-vs-data-lake) - [Snowflake and dbt](03-AdvancedSkills.md#snowflake-and-dbt) - [Transactional Data Stores](03-AdvancedSkills.md#transactional-data-stores) - [SQL Databases](03-AdvancedSkills.md#sql-databases) - [PostgreSQL DB](03-AdvancedSkills.md#postgresql-db) - [Database Design](03-AdvancedSkills.md#database-design) - [SQL Queries](03-AdvancedSkills.md#sql-queries) - [Stored Procedures](03-AdvancedSkills.md#stored-procedures) - [ODBC/JDBC Server Connections](03-AdvancedSkills.md#odbc-jdbc-server-connections) - [NoSQL Stores](03-AdvancedSkills.md#nosql-stores) - [HBase KeyValue Store](03-AdvancedSkills.md#keyvalue-stores-hbase) - [HDFS Document Store](03-AdvancedSkills.md#document-stores-hdfs) - [MongoDB Document 
Store](03-AdvancedSkills.md#document-stores-mongodb) - [Elasticsearch Document Store](03-AdvancedSkills.md#Elasticsearch-search-engine-and-document-store) - [Graph Databases (Neo4j)](03-AdvancedSkills.md#graph-db-neo4j) - [Impala](03-AdvancedSkills.md#impala) - [Kudu](03-AdvancedSkills.md#kudu) - [Apache Druid](03-AdvancedSkills.md#apache-druid) - [InfluxDB Time Series Database](03-AdvancedSkills.md#influxdb-time-series-database) - [Greenplum MPP Database](03-AdvancedSkills.md#mpp-databases-greenplum) - [NoSQL Data Warehouses](03-AdvancedSkills.md#nosql-data-warehouses) - [Hive Warehouse](03-AdvancedSkills.md#hive-warehouse) - [Impala](03-AdvancedSkills.md#impala) - [Visualize](03-AdvancedSkills.md#visualize) - [Android and IOS](03-AdvancedSkills.md#android-and-ios) - [API Design for Mobile Apps](03-AdvancedSkills.md#how-to-design-apis-for-mobile-apps) - [Dashboards](03-AdvancedSkills.md#dashboards) - [Grafana](03-AdvancedSkills.md#grafana) - [Kibana](03-AdvancedSkills.md#kibana) - [Webservers](03-AdvancedSkills.md#how-to-use-webservers-to-display-content) - [Tomcat](03-AdvancedSkills.md#tomcat) - [Jetty](03-AdvancedSkills.md#jetty) - [NodeRED](03-AdvancedSkills.md#nodered) - [React](03-AdvancedSkills.md#react) - [Business Intelligence Tools](03-AdvancedSkills.md#business-intelligence-tools) - [Tableau](03-AdvancedSkills.md#tableau) - [Power BI](03-AdvancedSkills.md#power-bi) - [Quliksense](03-AdvancedSkills.md#quliksense) - [Identity & Device Management](03-AdvancedSkills.md#Identity-and-device-management) - [What Is A Digital Twin](03-AdvancedSkills.md#what-is-a-digital-twin) - [Active Directory](03-AdvancedSkills.md#active-directory) - [Machine Learning](03-AdvancedSkills.md#machine-learning) - [How to do Machine Learning in production](03-AdvancedSkills.md#how-to-domachine-learning-in-production) - [Why machine learning in production is harder then you think](03-AdvancedSkills.md#why-machine-learning-in-production-is-harder-then-you-think) - [Models Do Not Work 
Forever](03-AdvancedSkills.md#models-do-not-work-forever) - [Where are The Platforms That Support Machine Learning](03-AdvancedSkills.md#where-are-the-platforms-that-support-this) - [Training Parameter Management](03-AdvancedSkills.md#training-parameter-management) - [How to Convince People That Machine Learning Works](03-AdvancedSkills.md#how-to-convince-people-machine-learning-works) - [No Rules No Physical Models](03-AdvancedSkills.md#no-rules-no-physical-models) - [You Have The Data. Use It!](03-AdvancedSkills.md#you-have-the-data-use-it) - [Data is Stronger Than Opinions](03-AdvancedSkills.md#data-is-stronger-than-opinions) - [AWS Sagemaker](03-AdvancedSkills.md#aws-sagemaker) ## Data Science Platform ### Why a Good Data Platform Is Important | Podcast Episode: #066 How To Do Data Science From A Data Engineers Perspective |------------------| |A simple introduction how to do data science in the context of the internet of things. | [Watch on YouTube](https://youtu.be/yp_cc4R0mGQ) \ [Listen on Anchor](https://anchor.fm/andreaskayy/episodes/066-How-To-Do-Data-Science-From-A-Data-Engineers-Perspective-e45imt)| ### Big Data vs Data Science and Analytics I talked about the difference in this podcast: ### The 4 Vs of Big Data It is a complete misconception. Volume is only one part of the often called four V's of big data: Volume, velocity, variety and veracity. **Volume** is about the size - How much data you have **Velocity** is about the speed - How fast data is getting to you How much data in a specific time needs to get processed or is coming into the system. This is where the whole concept of streaming data and real-time processing comes in to play. **Variety** is about the variety - How different your data is Like CSV files, PDFs that you have and stuff in XML. That you also have JSON logfiles, or data in some kind of a key-value store. It's about the variety of data types from different sources that you basically want to join together. 
All to make an analysis based on that data. **Veracity** is about the credibility - How reliable your data is The issue with big data is, that it is very unreliable. You cannot really trust the data. Especially when you're coming from the Internet of Things (IoT) side. Devices use sensors for measurement of temperature, pressure, acceleration and so on. You cannot always be a hundred percent sure that the actual measurement is right. When you have data that is from for instance SAP and it contains data that is created by hand you also have problems. As you know we humans are bad at inputting stuff. Everybody articulates differently. We make mistakes, down to the spelling and that can be a very difficult issue for analytics. I talked about the 4Vs in this podcast: ### Why Big Data? What I always emphasize is that the four V's are quite nice. They give you a general direction. There is a much more important issue: Catastrophic Success. What I mean by catastrophic success is, that your project, your startup or your platform has more growth than you anticipated. Exponential growth is what everybody is looking for. Because with exponential growth there is the money. It starts small and gets very big very fast. The classic hockey stick curve: 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, .... BOOM! Think about it. It starts small and quite slow, but gets very big very fast. You get a lot of users or customers who are paying money to use your service, the platform or whatever. If you have a system that is not equipped to scale and process the data the whole system breaks down. That's catastrophic success. You are so successful and grow so fast that you cannot fulfill the demand anymore. And so you fail and it's all over. It's not like you can just make that up as you go. Or that you can foresee that in a few months or weeks the current system doesn't work anymore. ### Planning is Everything It all happens very, very fast and you cannot react anymore.
There's a necessary type of planning and analyzing the potential of your business case necessary. Then you need to decide if you actually have big data or not. You need to decide if you use big data tools. This means when you conceptualize the whole infrastructure it might look ridiculous to actually focus on big data tools. But in the long run it will help you a lot. Good planning will get a lot of problems out of the way, especially if you think about streaming data and real-time analytics. ### The problem with ETL A typical old-school platform deployment would look like the picture below. Devices use a data API to upload data that gets stored in a SQL database. An external analytics tool is querying data and uploading the results back to the SQL DB. Users then use the user interface to display data stored in the database. ![Common SQL Platform Architecture](/images/Common-SQL-Architecture.jpg) Now, when the front end queries data from the SQL database the following three steps happen: \- The database extracts all the needed rows from the storage. (E) - The extracted data gets transformed, for instance sorted by timestamp or something a lot more complex. (T) - The transformed data is loaded to the destination (the user interface) for chart creation. (L) With exploding amounts of stored data the ETL process starts being a real problem. Analytics is working with large data sets, for instance whole days, weeks, months or more. Data sets are very big like 100GB or Terabytes. That means Billions or Trillions of rows. This has the result that the ETL process for large data sets takes longer and longer. Very quickly the ETL performance gets so bad it won't deliver results to analytics anymore. A traditional solution to overcome these performance issues is trying to increase the performance of the database server. That's what's called scaling up. 
### Scaling Up To scale up the system and therefore increase ETL speeds administrators resort to more powerful hardware by: Speeding up the extract performance by adding faster disks to physically read the data faster. Increasing RAM for row caching. What is already in memory does not have to be read by slow disk drives. Using more powerful CPU's for better transform performance (more RAM helps here as well). Increasing or optimising networking performance for faster data delivery to the front end and analytics. In summary: Scaling up the system is fairly easy. ![Scaling up a SQL Database](/images/SQL-Scaling-UP.jpg) But with exponential growth it is obvious that sooner or later (more sooner than later) you will run into the same problems again. At some point you simply cannot scale up anymore because you already have a monster system, or you cannot afford to buy more expensive hardware. The next step you could take would be scaling out. ### Scaling Out Scaling out is the opposite of scaling up. Instead of building bigger systems the goal is to distribute the load between many smaller systems. The easiest way of scaling out an SQL database is using a storage area network (SAN) to store the data. You can then use up to eight SQL servers (explain), attach them to the SAN and let them handle queries. This way load gets distributed between those eight servers. ![Scaling out a SQL Database](/images/SQL-Scaling-Out.jpg) One major downside of this setup is that, because the storage is shared between the SQL servers, it can only be used as an read only database. Updates have to be done periodically, for instance once a day. To do updates all SQL servers have to detach from the database. Then, one is attaching the DB in read-write mode and refreshing the data. This procedure can take a while if a lot of data needs to be uploaded. This Link (missing) to a Microsoft MSDN page has more options of scaling out an SQL database for you. 
I deliberately don't want to get into details about possible scaling out solutions. The point I am trying to make is that while it is possible to scale out SQL databases it is very complicated. There is no perfect solution. Every option has its up- and downsides. One common major issue is the administrative effort that you need to take to implement and maintain a scaled out solution. ### Please don't go Big Data If you don't run into scaling issues, please do not use big data tools! Big data is an expensive thing. A Hadoop cluster for instance needs at least five servers to work properly. More is better. Believe me this stuff costs a lot of money. Especially when you take maintenance and development on top of big data tools into account. If you don't need it, it makes absolutely no sense at all! On the other side: If you really need big data tools they will save your ass :) ## 81 Platform and Pipeline Design Questions Many people ask: "How do you select the platform, tools and design the pipelines?" The options seem infinite. Technology however should never dictate the decisions. Here are 81 questions that you should answer when starting a project. ### Data Source Questions #### Data Origin and Structure - **What is the source?** Understand the "device." - **What is the format of the incoming data?** (e.g., JSON, CSV, Avro, Parquet) - **What’s the schema?** - **Is the data structured, semi-structured, or unstructured?** - **What is the data type?** Understand the content of the data.
- **Is the schema well-defined, or is it dynamic?** - **How are changes in the data structure from the source (schema evolution) handled?** #### Data Volume & Velocity - **How much data is transmitted per transmission?** - **How fast is the data coming in?** (e.g., messages per minute) - **What is the maximum data volume expected per source per day?** - **What scaling of sources/data is expected?** - **Are there peaks for incoming data?** - **How much data is posted per day across all sources?** - **How does the data volume fluctuate?** (e.g., seasonal peaks, hourly/daily variations) - **How will the system handle bursts of data?** (e.g., throttling or buffering) #### Source Reliability & Redundancy - **Is there data arriving late?** - **Is there a risk of duplicate data from the source?** How will we handle de-duplication? - **How reliable are the sources?** What’s the expected failure rate? - **How do we handle data corruption or loss during transmission?** - **What happens if a source goes offline?** Is there a fallback or failover source? - **Do we need to retry failed transmissions or have fault-tolerance mechanisms in place?** #### Data Extraction & New Sources - **Do we need to extract the data from the sources?** - **How many sources are there?** - **Will new sources be implemented?** #### Data Source Connectivity & Authentication - **How is the data arriving?** (API, bucket, etc.) - **How is the authentication done?** - **What kind of connection is required for the data source?** (e.g., streaming, batch, API) - **What protocols are used for data ingestion?** (e.g., REST, WebSocket, FTP) - **Are there any rate limits or quotas imposed by the data source?** - **How do we handle credentials?** Is there an API? 
- **What is the retry strategy if data fails to be processed or transmitted?** #### Data Security & Compliance - **Does the data need to be encrypted at the source before being transmitted?** - **Are there any compliance frameworks (e.g., GDPR, HIPAA) that the source data must adhere to?** - **Is there a requirement for data masking or obfuscation at the source?** #### Metadata & Audit - **Is there metadata for the client transmission stored somewhere?** - **What metadata should be captured for each transmission?** (e.g., record counts, latency) - **How do we track and log data ingestion events for audit purposes?** - **Is there a need for tracking data lineage?** (i.e., source origin and changes over time) --- ### Goals and Destination Questions #### Use Case & Data Consumption - **What kind of use case is this?** (Analytics, BI, ML, Transactional processing, Visualization, User Interfaces, APIs) - **What are the typical use cases that require this data?** (e.g., predictive analytics, operational dashboards) - **What are the downstream systems or platforms that will consume this data?** - **How critical is real-time data versus historical data in this use case?** #### Data Query & Delivery - **How is the data visualized?** (raw data, aggregated data) - **How much raw data is processed at once?** - **How much data is cold data, and how often is cold data queried?** - **How fast do the results need to appear?** - **How much data is going to be queried at once?** - **How fresh does the data need to be?** - **How often is the data queried?** (frequency) - **What are the SLAs for delivering data to downstream systems or applications?** #### Aggregation & Modeling - **How is the data aggregated?** (by devices, topic, time) - **When does the aggregation happen?** (on query, on schedule, while streaming) - **What kind of data models are needed for this use case?** (e.g., star schema, snowflake schema) - **Is there a need for pre-aggregations to speed up queries?** - 
**Should partitioning or indexing strategies be implemented to optimize query performance?** #### Performance & Availability - **What is the processing time requirement?** - **What is the availability of analytics output?** (input vs output delay) - **How fresh does the data need to be?** - **What are the performance expectations for query speed?** - **What is the acceptable query response time for end-users?** - **How will the system handle an increase in concurrent queries from multiple users?** - **What is the expected lag between data ingestion and availability for querying?** - **Do we need horizontal scaling for query engines or databases?** #### Data Lifecycle & Retention - **What’s the data retention time?** - **How often is data archived or moved to lower-cost storage?** - **Will old data need to be transformed or reprocessed for new use cases?** - **What are the data retention policies?** (e.g., hot vs cold storage) - **How will the use case evolve as the data grows?** Will this affect how data is consumed or visualized? 
#### Monitoring & Debugging - **How will data delivery to the destination be monitored?** (e.g., time-to-load, query failures) - **How will we monitor data pipeline health at the destination?** (e.g., throughput, latency) - **What tools or methods will be used for debugging data delivery failures or performance bottlenecks?** - **What metrics should be tracked to ensure data pipeline health?** (e.g., latency, throughput) - **How do we handle issues such as data corruption or incomplete data at the destination?** #### Data Access & Permissions - **Who is working with the platform, and who has access to query or visualize the data?** - **Which tools are used to query the data?** - **What kind of data export capabilities are required?** (e.g., CSV, API, direct database access) - **Is role-based access control (RBAC) needed to segment data views for different users?** - **How will access to sensitive data be managed?** (e.g., row-level security, encryption) #### Scaling & Future Requirements - **What are the scalability requirements for the data platform as data volume grows?** - **How will future business goals or scalability needs affect the design of data aggregation and retention strategies?** - **How will the system handle an increasing load as more users query data or as data volume grows?** ## Connect ### REST APIs APIs or Application Programming Interfaces are the cornerstones of any great data platform. | Podcast Episode: #033 How APIs Rule The World |------------------| |Strong APIs make a good platform. In this episode I talk about why you need APIs and why Twitter is a great example. Especially JSON APIs are my personal favorite. Because JSON is also important in the Big Data world, for instance in log analytics. How? Check out this episode! | [Listen on Anchor](https://anchor.fm/andreaskayy/episodes/How-APIs-Rule-The-World--PoDS-033-e24ttq)| #### API Design In this podcast episode we look into the Twitter API. 
It's a great example how to build an API | Podcast Episode: #081 Twitter API Research Data Engineering Course Part 5 |------------------| |In this episode we look into the Twitter API documentation, which I love by the way. How can we get old tweets for a certain hashtags and how to get current live tweets for these hashtags? | [Watch on YouTube](https://youtu.be/UnAXKxeIlyg) \ [Listen on Anchor](https://anchor.fm/andreaskayy/episodes/081-How-to-get-tweets-from-the-Twitter-API-e45j32)| #### Payload compression attacks How to defend your Server with zip Bombs https://www.sitepoint.com/how-to-defend-your-website-with-zip-bombs/ #### Implementation Frameworks Jersey: Tutorial – REST API design and implementation in Java with Jersey and Spring: https://www.codepedia.org/ama/tutorial-rest-api-design-and-implementation-in-java-with-jersey-and-spring/ Swagger: Jersey vs Swagger: Spring Framework: When to use Spring or Jersey: #### OAuth security ### Apache Nifi Nifi is one of these tools that I identify as high potential tools. It allows you to create a data pipeline very easily. Read data from a RestAPI and post it to Kafka? No problem Read data from Kafka and put it into a database? No problem It's super versatile and you can do everything on the UI. I use it in Part 3 of this Document. Check it out. Check out the Apache Nifi FAQ website. Also look into the documentation to find all possible data sources and sinks of Nifi: Here's a great blog about Nifi: ### Logstash ### FluentD Data Collector https://www.fluentd.org/ ### Apache Flume https://flume.apache.org/ ### Sqoop https://sqoop.apache.org/ ### Azure IoTHub https://azure.microsoft.com/en-us/services/iot-hub/ ## Buffer ### Apache Kafka #### Why a message queue tool? 
#### Kafka architecture #### What are topics #### What does Zookeeper have to do with Kafka #### How to produce and consume messages My YouTube video how to set up Kafka at home: My YouTube video how to write to Kafka: #### KAFKA Commands Start Zookeeper container for Kafka: docker run -d --name zookeeper-server \ --network app-tier \ -e ALLOW_ANONYMOUS_LOGIN=yes \ bitnami/zookeeper:latest Start Kafka container: docker run -d --name kafka-server \ --network app-tier \ -e KAFKA_CFG_ZOOKEEPER_CONNECT=zookeeper-server:2181 \ -e ALLOW_PLAINTEXT_LISTENER=yes \ bitnami/kafka:latest ### Redis Pub-Sub ### AWS Kinesis ### Google Cloud PubSub ## Processing Frameworks ### Lambda and Kappa Architecture | Podcast Episode: #077 Lambda Architecture and Kappa Architecture |------------------| |In this stream we talk about the lambda architecture with stream and batch processing as well as a alternative the Kappa Architecture that consists only of streaming. Also Data engineer vs data scientist and we discuss Andrew Ng’s AI Transformation Playbook. | [Watch on YouTube](https://youtu.be/iUOQPyHN9-0) \ [Listen on Anchor](https://anchor.fm/andreaskayy/episodes/077-Lambda--Kappa-Architecture-e45j0r)| ### Batch Processing Ask the big questions. Remember your last yearly tax statement? You break out the folders. You run around the house searching for the receipts. All that fun stuff. When you finally found everything you fill out the form and send it on its way. Doing the tax statement is a prime example of a batch process. Data comes in and gets stored, analytics loads the data from storage and creates an output (insight): ![Batch Processing Pipeline](/images/Simple-Batch-Processing-Workflow.jpg) Batch processing is something you do either without a schedule or on a schedule (tax statement). It is used to ask the big questions and gain the insights by looking at the big picture. To do so, batch processing jobs use large amounts of data. 
This data is provided by storage systems like Hadoop HDFS. They can store lots of data (petabytes) without a problem. Results from batch jobs are very useful, but the execution time is high. Because the amount of used data is high. It can take minutes or sometimes hours until you get your results. ### Stream Processing Gain instant insight into your data. Streaming allows users to make quick decisions and take actions based on "real-time" insight. Contrary to batch processing, streaming processes data on the fly, as it comes in. With streaming you don't have to wait minutes or hours to get results. You gain instant insight into your data. In the batch processing pipeline, the analytics was after the data storage. It had access to all the available data. Stream processing creates insight before the data storage. It has only access to fragments of data as it comes in. As a result the scope of the produced insight is also limited. Because the big picture is missing. ![Stream Processing Pipeline](/images/Simple-Stream-Processing-Workflow.jpg) Only with streaming analytics you are able to create advanced services for the customer. Netflix for instance incorporated stream processing into Chuckwa V2.0 and the new Keystone pipeline. One example of advanced services through stream processing is the Netflix "Trending Now" feature. Check out the Netflix case study. #### Three methods of streaming In stream processing sometimes it is ok to drop messages, other times it is not. Sometimes it is fine to process a message multiple times, other times that needs to be avoided like hell. Today's topic are the different methods of streaming: At most once, at least once and exactly once. What this means and why it is so important to keep them in mind when creating a solution. That is what you will find out in this article. #### At Least Once At least once, means a message gets processed in the system once or multiple times. 
So with at least once it's not possible that a message gets into the system and is not getting processed. It's not getting dropped or lost somewhere in the system. One example where at least once processing can be used is when you think about fleet management of cars. You get GPS data from cars and that data is transmitted with a timestamp and the GPS coordinates. It's important that you get the GPS data at least once, so you know where the car is. If you're processing this data multiple times, it always has the timestamp with it. Because of the timestamp, it does not matter that it gets processed multiple times. Or that it would be stored multiple times, because it would just overwrite the existing one. #### At Most Once The second streaming method is at most once. At most once means that it's okay to drop some information, to drop some messages. But it's important that a message is only processed once as a maximum. An example of this is event processing. Some event is happening and that event is not important enough, so it can be dropped. It doesn't have any consequences when it gets dropped. But when that event happens it's important that it does not get processed multiple times. Then it would look as if the event happened five or six times instead of only once. Think about engine misfires. If it happens once, no big deal. But if the system tells you it happens a lot you will think you have a problem with your engine. #### Exactly Once Another thing is exactly once. This means it's not okay to drop data, it's not okay to lose data and it's also not okay to process data multiple times. An example of this is banking. When you think about credit card transactions it's not okay to drop a transaction. When dropped, your payment is not going through. It's also not okay to have a transaction processed multiple times, because then you are paying multiple times. #### Check The Tools! All of this sounds very simple and logical.
What kind of processing is done has to be a requirement for your use case. It needs to be thought about in the design process, because not every tool is supporting all three methods. Very often you need to code your application very differently based on the streaming method. Especially exactly once is very hard to do. So, the tool of data processing needs to be chosen based on if you need exactly once, at least once or if you need at most once. ### Should you do stream or batch processing? It is a good idea to start with batch processing. Batch processing is the foundation of every good big data platform. A batch processing architecture is simple, and therefore quick to set up. Platform simplicity means, it will also be relatively cheap to run. A batch processing platform will enable you to quickly ask the big questions. They will give you invaluable insight into your data and customers. When the time comes and you also need to do analytics on the fly, then add a streaming pipeline to your batch processing big data platform. ### Is ETL still relevant for Analytics? | Podcast Episode: #039 Is ETL Dead For Data Science & Big Data? |------------------| |Is ETL dead in Data Science and Big Data? In today’s podcast I share with you my views on your questions regarding ETL (extract, transform, load). Is ETL still practiced or did pre-processing & cleansing replace it. What would replace ETL in Data Engineering. | [Watch on YouTube](https://youtu.be/leSOWPaNkl4) \ [Listen on Anchor](https://anchor.fm/andreaskayy/episodes/Is-ETL-Dead-For-Data-Science--Big-Data---PoDS-039-e2b604)| ### MapReduce Since the early days of the Hadoop eco system, the MapReduce framework is one of the main components of Hadoop alongside HDFS. Google for instance used MapReduce to analyse stored HTML content of websites through counting all the HTML tags and all the words and combinations of them (for instance headlines). The output was then used to create the page ranking for Google Search. 
That was when everybody started to optimise their websites for Google Search. Serious search engine optimisation was born. That was the year 2004. MapReduce works by processing data in two phases: the map phase and the reduce phase. In the map phase, the framework is reading data from HDFS. Each dataset is called an input record. Then there is the reduce phase. In the reduce phase, the actual computation is done and the results are stored. The storage target can either be a database or back to HDFS or something else. After all it's Java -- so you can implement what you like. The magic of MapReduce is how the map and reduce phases are implemented and how both phases are working together. The map and reduce phases are parallelised. What that means is that you have multiple map phases (mappers) and reduce phases (reducers) that can run in parallel on your cluster machines. Here's an example of how such a map and reduce process works with data: ![Mapping of input files and reducing of mapped records](/images/MapReduce-Process-Detailed.jpg) #### How does MapReduce work First of all, the whole map and reduce process relies heavily on using key-value pairs. That's what the mappers are for. In the map phase input data, for instance a file, gets loaded and transformed into key-value pairs. When each map phase is done it sends the created key-value pairs to the reducers where they are getting sorted by key. This means that an input record for the reduce phase is a list of values from the mappers that all have the same key. Then the reduce phase is doing the computation of that key and its values and outputting the results. How many mappers and reducers can you use in parallel? The number of parallel map and reduce processes depends on how many CPU cores you have in your cluster. Every mapper and every reducer is using one core. This means that the more CPU cores you actually have, the more mappers you can use, the faster the extraction process can be done.
The more reducers you are using the faster the actual computation is being done. To make this more clear, I have prepared an example: #### Example As I said before, MapReduce works in two stages, map and reduce. Often these stages are explained with a word count task. Personally, I hate this example because counting stuff is too trivial and does not really show you what you can do with MapReduce. Therefore, we are going to use a more real-world use case from the IoT world. IoT applications create an enormous amount of data that has to be processed. This data is generated by physical sensors that take measurements, like room temperature at 8 o'clock. Every measurement consists of a key (the timestamp when the measurement has been taken) and a value (the actual value measured by the sensor). Because you usually have more than one sensor on your machine, or connected to your system, the key has to be a compound key. Compound keys contain, in addition to the measurement time, information about the source of the signal. But, let's forget about compound keys for now. Today we have only one sensor. Each measurement outputs key-value pairs like: Timestamp-Value. The goal of this exercise is to create average daily values of that sensor's data. The image below shows how the map and reduce process works. First, the map stage loads unsorted data (input records) from the source (e.g. HDFS) by key and value (key:2016-05-01 01:02:03, value:1). Then, because the goal is to get daily averages, the hour:minute:second information is cut from the timestamp. That is all that happens in the map phase, nothing more. After all parallel map phases are done, each key-value pair gets sent to the one reducer that is handling all the values for this particular key. Every reducer input record then has a list of values and you can calculate (1+5+9)/3, (2+6+7)/3 and (3+4+8)/3. That's all.
![MapReduce Example of Time Series Data](/images/MapReduce-Time-Series-example.jpg) What do you think you need to do to generate minute averages? Yes, you need to cut the key differently. You then would need to cut it like this: "2016-05-01 01:02", keeping the hour and minute information in the key. What you can also see is, why map reduce is so great for doing parallel work. In this case, the map stage could be done by nine mappers in parallel because each map is independent from all the others. The reduce stage could still be done by three tasks in parallel. One for orange, one for blue and one for green. That means, if your dataset would be 10 times as big and you'd have 10 times the machines, the time to do the calculation would be the same. #### What is the limitation of MapReduce? MapReduce is awesome for simpler analytics tasks, like counting stuff. It just has one flaw: It has only two stages Map and Reduce. ![The Map Reduce Process](/images/MapReduce-Process.jpg) First MapReduce loads the data from HDFS into the mapping function. There you prepare the input data for the processing in the reducer. After the reduce is finished the results get written to the data store. The problem with MapReduce is that there is no simple way to chain multiple map and reduce processes together. At the end of each reduce process the data must be stored somewhere. This fact makes it very hard to do complicated analytics processes. You would need to chain MapReduce jobs together. Chaining jobs with storing and loading intermediate results just makes no sense. Another issue with MapReduce is that it is not capable of streaming analytics. Jobs take some time to spin up, do the analytics and shut down. Basically Minutes of wait time are totally normal. This is a big negative point in a more and more real time data processing world. ### Apache Spark I talked about the three methods of data streaming in this podcast: #### What is the difference to MapReduce? 
Spark is a complete in-memory framework. Data gets loaded from, for instance HDFS, into the memory of workers. There is no longer a fixed map and reduce stage. Your code can be as complex as you want. Once in memory, the input data and the intermediate results stay in memory (until the job finishes). They do not get written to a drive like with MapReduce. This makes Spark the optimal choice for doing complex analytics. It allows you for instance to do iterative processes. Modifying a dataset multiple times in order to create an output is totally easy. Streaming analytics capability is also what makes Spark so great. Spark has natively the option to schedule a job to run every X seconds or X milliseconds. As a result, Spark can deliver you results from streaming data in "real time". #### How does Spark fit to Hadoop? There are some very misleading articles out there titled \"Spark or Hadoop\", \"Spark is better than Hadoop\" or even \"Spark is replacing Hadoop\". So, it's time to show you the differences between Spark and Hadoop. After this you will know when and for what you should use Spark and Hadoop. You'll also understand why \"Hadoop or Spark\" is the totally wrong question. #### Where's the difference? To make it clear how Hadoop differs from Spark I created this simple feature table: ![Hadoop vs Spark capabilities](/images/Table-Hadoop-and-Spark.jpg) Hadoop is used to store data in the Hadoop Distributed File System (HDFS). It can analyse the stored data with MapReduce and manage resources with YARN. However, Hadoop is more than just storage, analytics and resource management. There's a whole eco system of tools around the Hadoop core. I've written about its eco system in this article: [missing](missing) What is Hadoop and why is it so freakishly popular. You should check it out as well. Compared to Hadoop, Spark is "just" an analytics framework. It has no storage capability. 
Although it has a standalone resource manager, you usually don't use that feature. #### Spark and Hadoop is a perfect fit So, if Hadoop and Spark are not the same things, can they work together? Absolutely! Here's how the first picture will look if you combine Hadoop with Spark: missing As Storage you use HDFS. Analytics is done with Apache Spark and YARN is taking care of the resource management. Why does that work so well together? From a platform architecture perspective, Hadoop and Spark are usually managed on the same cluster. This means on each server where an HDFS data node is running, a Spark worker thread runs as well. In distributed processing, network transfer between machines is a large bottleneck. Transferring data within a machine reduces this traffic significantly. Spark is able to determine on which data node the needed data is stored. This allows a direct load of the data from the local storage into the memory of the machine. This reduces network traffic a lot. #### Spark on YARN: You need to make sure that your physical resources are distributed perfectly between the services. This is especially the case when you run Spark workers with other Hadoop services on the same machine. It just would not make sense to have two resource managers managing the same server's resources. Sooner or later they will get in each other's way. That's why the Spark standalone resource manager is seldom used. So, the question is not Spark or Hadoop. The question has to be: Should you use Spark or MapReduce alongside Hadoop's HDFS and YARN? #### My simple rule of thumb: If you are doing simple batch jobs like counting values or calculating averages: Go with MapReduce. If you need more complex analytics like machine learning or fast stream processing: Go with Apache Spark. #### Available Languages Spark jobs can be programmed in a variety of languages. That makes creating analytic processes very user-friendly for data scientists.
Spark supports Python, Scala and Java. With the help of SparkR you can even connect your R program to a Spark cluster. If you are a data scientist who is very familiar with Python just use Python, it's great. If you know how to code Java I suggest you start using Scala. Spark jobs are easier to code in Scala than in Java. In Scala you can use anonymous functions to do processing. This results in less overhead and much cleaner, simpler code. With Java 8 simplified function calls were introduced with lambda expressions. Still, a lot of people, including me, prefer Scala over Java. #### How Spark works: Driver, Executor, Sparkcontext | Podcast Episode: #100 Apache Spark Week Day 1 |------------------| |Is ETL dead in Data Science and Big Data? In today’s podcast I share with you my views on your questions regarding ETL (extract, transform, load). Is ETL still practiced or did pre-processing & cleansing replace it. What would replace ETL in Data Engineering. | [Watch on YouTube](https://youtu.be/qD6Wi2pfCx0) #### Spark batch vs stream processing #### How does Spark use data from Hadoop Another thing is data locality. I always make the point that processing data locally where it is stored is the most efficient thing to do. That's exactly what Spark is doing. You can and should run Spark workers directly on the data nodes of your Hadoop cluster. Spark can then natively identify on what data node the needed data is stored. This enables Spark to use the worker running on the machine where the data is stored to load the data into the memory. ![Spark Using Hadoop Data Locality](/images/Spark-Data-Locality.jpg) The downside of this setup is that you need more expensive servers, because Spark processing needs stronger servers with more RAM and CPUs than a "pure" Hadoop setup. #### What are RDDs and how to use them RDDs are the core part of Spark. I learned and used RDDs first. It felt familiar coming from MapReduce. Nowadays you use Dataframes or Datasets.
I still find it valuable to learn how RDDs and therefore Spark work at a lower level. | Podcast Episode: #101 Apache Spark Week Day 2 |------------------| |On day two of the Apache Spark week we take a look at major Apache Spark concepts: RDDs, transformations and actions, caching and broadcast variables. | [Watch on YouTube](https://youtu.be/9I6mA2W6_HU) #### How and why to use SparkSQL? When you use Apache Zeppelin notebooks to learn Spark you automatically come across SparkSQL. SparkSQL allows you to access Dataframes with SQL-like queries. Especially when you work with notebooks it is very handy to create charts from your data. You can learn from mistakes more easily than by just deploying Spark applications. | Podcast Episode: #102 Apache Spark Week Day 3 |------------------| | We continue the Spark week, hands on. We do a full example from reading a csv, doing maps and flatmaps, to writing to disk. We also use SparkSQL to visualize the data. | [Watch on YouTube](https://youtu.be/Fk-s8eKD4ZI) #### What are DataFrames how to use them As I said before, Dataframes are the successors to RDDs. It's the new Spark API. Dataframes are basically like tables in a SQL database or like an Excel sheet. This makes them very simple to use and manipulate with SparkSQL. I highly recommend going this route. Processing with Dataframes is even faster than with RDDs, because it uses optimization algorithms for the data processing. | Podcast Episode: #103 Apache Spark Week Day 4 |------------------| |We look into Dataframes, Dataframes and Dataframes. | [Watch on YouTube](https://youtu.be/9I6mA2W6_HU) #### Machine Learning on Spark? (Tensor Flow) Wouldn't it be great to use your deep learning TensorFlow applications on Spark? Yes, it is already possible. Check out these Links: Why do people integrate Spark with TensorFlow even if there is a distributed TensorFlow framework?
TensorFlow On Spark: Scalable TensorFlow Learning on Spark Clusters: Deep Learning with Apache Spark and TensorFlow: #### MLlib: The machine learning library MLlib is included in Spark so there is often no need to import another library. I have to admit because I am not a data scientist I am not an expert in machine learning. From what I have seen and read though the machine learning framework MLlib is a nice treat for data scientists wanting to train and apply models with Spark. #### Spark Setup From a solution architect's point of view Spark is a perfect fit for Hadoop big data platforms. This has a lot to do with cluster deployment and management. Companies like Cloudera, MapR or Hortonworks include Spark in their Hadoop distributions. Because of that, Spark can be deployed and managed with the cluster's Hadoop management web frontend. This makes the process for deploying and configuring a Spark cluster very quick and admin friendly. #### Spark Resource Management When running a computing framework you need resources to do computation: CPU time, RAM, I/O and so on. Out of the box Spark can manage resources with its stand-alone resource manager. If Spark is running in a Hadoop environment you don't have to use Spark's own stand-alone resource manager. You can configure Spark to use Hadoop's YARN resource management. Why would you do that? It allows YARN to efficiently allocate resources to your Hadoop and Spark processes. Having a single resource manager instead of two independent ones makes it a lot easier to configure the resource management.
![Spark Resource Management With YARN](/images/Spark-Yarn.jpg) ### Samza [Link to Apache Samza Homepage](http://samza.apache.org/) ### AWS Lambda [Link to AWS Lambda Homepage](https://aws.amazon.com/lambda/) ### Apache Flink [Link to Apache Flink Homepage](https://flink.apache.org/) ### Elasticsearch [Link to Elasticsearch Homepage](https://www.elastic.co/products/elastic-stack) ### Graph DB Graph databases store data in terms of nodes and relationships. Each node represents an entity (people, movies, things and other data points) and a relationship represents how the nodes are related. They are designed to store and treat the relationships with the same importance as the data (or nodes in this case). This relationship-first approach makes a lot of difference as the relationship between data need not be inferred anymore with foreign and primary keys. Graph databases are especially useful when applications require navigating through multiple and multi-level relationships between various data points. #### Neo4j Neo4j is currently the most popular graph database management system. It is ACID compliant and provides its own implementation of a graph database. In addition to nodes and relationships, Neo4j has the following components to enrich the data model with information. • Labels. These are used to group nodes, and each node can be assigned multiple labels. Labels are indexed to speed up finding nodes in a graph. • Properties. These are attributes of both nodes and relationships. Neo4j allows for storing data as key-value pairs, which means properties can have any value (string, number, or boolean).
##### Advantages • Neo4j is schema-free • Highly available and provides transactional guarantees • Cypher is a declarative query language which makes it very easy to navigate the graph • Neo4j is fast and easily traversible because the data is connected and is very easy to query, retrieve and navigate the graph • For the same reason as above, there are no joins in Neo4j ##### Disadvantages • Neo4j is not the best for any kind of aggregations or sorting, in comparison with a relational database • While doable, they are not the best to handle transactional data like accounting • Sharding is currently not supported ##### Use Cases https://neo4j.com/use-cases/ ### Apache Solr [Link to Solr Homepage](https://solr.apache.org) ### Apache Drill [Link to Apache Drill Homepage](https://drill.apache.org) ### Apache Storm https://storm.apache.org/ ### StreamSets ## Store ### Analytical Data Stores #### Data Warehouse vs Data Lake | Podcast Episode: #055 Data Warehouse vs Data Lake |------------------| |On this podcast we are going to talk about data warehouses and data lakes? When do people use which? What are the pros and cons of both? Architecture examples for both Does it make sense to completely move to a data lake? | [Watch on YouTube](https://youtu.be/8gNQTrUUwMk) \ [Listen on Anchor](https://anchor.fm/andreaskayy/episodes/055-Data-Warehouse-vs-Data-Lake-e45iem)| #### Snowflake and dbt ![Snowlfake thumb](/images/03/Snowflake-dbt-thumbnail.jpeg) In the rapidly evolving landscape of data engineering, staying ahead means continuously expanding your skill set with the latest tools and technologies. Among the myriad of options available, dbt (data build tool) and Snowflake have emerged as indispensable for modern data engineering workflows. Understanding and leveraging these tools can significantly enhance your ability to manage and transform data, making you a more effective and valuable data engineer. 
Let's dive into why dbt and Snowflake should be at the top of your learning list and explore how the "dbt for Data Engineers" and "Snowflake for Data Engineers" courses from the Learn Data Engineering Academy can help you achieve mastery in these tools. ##### The Power of Snowflake in Data Engineering Snowflake has revolutionized the data warehousing space with its cloud-native architecture. It offers a scalable, flexible, and highly performant platform that simplifies data management and analytics. Here’s why Snowflake is a critical skill for data engineers: 1. **Cloud-Native Flexibility:** Snowflake’s architecture allows you to scale resources up or down based on your needs, ensuring optimal performance and cost-efficiency. 2. **Unified Data Platform:** It unifies data silos, enabling seamless data sharing and collaboration across the organization. 3. **Integration Capabilities:** Snowflake integrates with various data tools and platforms, enhancing its versatility in different data workflows. 4. **Advanced Analytics:** With its robust support for data querying, transformation, and integration, Snowflake is ideal for complex analytical workloads. The "Snowflake for Data Engineers" course in my Learn Data Engineering Academy provides comprehensive training on Snowflake. From the basics of setting up your Snowflake environment to advanced data automation with Snowpipes, the course equips you with practical skills to leverage Snowflake effectively in your data projects. Learn more about the course [here](https://learndataengineering.com/p/snowflake-for-data-engineers). ![Snowlfake thumb](/images/03/Snowflake-ui.jpeg) ##### Why dbt is a Game-Changer for Data Engineers dbt is a powerful transformation tool that allows data engineers to transform, test, and document data directly within their data warehouse using simple SQL. 
Unlike traditional ETL tools, dbt operates on the principle of ELT (Extract, Load, Transform), which aligns perfectly with modern cloud data warehousing paradigms. Here are a few reasons why dbt is a must-have skill for data engineers: 1. **SQL-First Approach:** dbt allows you to write transformations in SQL, the lingua franca of data manipulation, making it accessible to a broad range of data professionals. 2. **Collaboration:** Teams can collaborate seamlessly, creating trusted datasets for reporting, machine learning, and operational workflows. 3. **Ease of Use:** With dbt, you can transform, test, and document your data with ease, streamlining the data pipeline process. 4. **Integration:** dbt integrates effortlessly with your existing data warehouse, such as Snowflake, making it a versatile addition to your toolkit. In my Learn Data Engineering Academy you find the perfect starting point for mastering dbt with the course "dbt for Data Engineers". The course covers everything from the basics of ELT processes to advanced features like continuous integration and deployment (CI/CD) pipelines. With hands-on training, you'll learn to create data pipelines, configure dbt materializations, test dbt models, and much more. Learn more about the course [here](https://learndataengineering.com/p/dbt-for-data-engineers). ![Snowlfake thumb](/images/03/dbt-ui.jpeg) ##### dbt and Snowflake: A Winning Combination When used together, dbt and Snowflake offer a powerful combination for data engineering. Here’s why: 1. **Seamless Integration:** dbt’s SQL-first transformation capabilities integrate perfectly with Snowflake’s scalable data warehousing, creating a streamlined ELT workflow. 2. **Efficiency:** Together, they enhance the efficiency of data transformation and analytics, reducing the time and effort required to prepare data for analysis. 3. 
**Scalability:** The combined power of dbt’s model management and Snowflake’s dynamic scaling ensures that your data pipelines can handle large and complex datasets with ease. 4. **Collaboration and Documentation:** dbt’s ability to document and test data transformations directly within Snowflake ensures that data workflows are transparent, reliable, and collaborative. Get right into it with our Academy! By integrating Snowflake and dbt into your skill set, you position yourself at the forefront of data engineering innovation. These tools not only simplify and enhance your data workflows but also open up new possibilities for data transformation and analysis. ### Transactional Data Stores #### SQL Databases ##### PostgreSQL DB Homepage: PostgreSQL vs MongoDB: ##### Database Design ##### SQL Queries ##### Stored Procedures ##### ODBC/JDBC Server Connections #### NoSQL Stores ##### KeyValue Stores (HBase) | Podcast Episode: #056 NoSQL Key Value Stores Explained with HBase |------------------| |What is the difference between SQL and NoSQL? In this episode I show you on the example of HBase how a key/value store works. | [Watch on YouTube](https://youtu.be/67hIkbpzFc8) \ [Listen on Anchor](https://anchor.fm/andreaskayy/episodes/056-NoSQL-Key-Value-Stores-Explained-With-HBase-e45ifb)| ##### Document Store HDFS The Hadoop distributed file system, or HDFS, allows you to store files in Hadoop. The difference between HDFS and other file systems like NTFS or EXT is that it is a distributed one. What does that mean exactly? A typical file system stores your data on the actual hard drive. It is hardware dependent. If you have two disks then you need to format every disk with its own file system. They are completely separate. You then decide on which disk you physically store your data. HDFS works different to a typical file system. HDFS is hardware independent. Not only does it span over many disks in a server. It also spans over many servers. 
HDFS will automatically place your files somewhere in the Hadoop server collective. It will not only store your file, Hadoop will also replicate it two or three times (you can define that). Replication means replicas of the file will be distributed to different servers. ![HDFS Master and Data Nodes](/images/HDFS-Master-DataNodes.jpg) This gives you superior fault tolerance. If one server goes down, then your data stays available on a different server. Another great thing about HDFS is that there is no limit on how big the files can be. You can have server log files that are terabytes big. How can files get so big? HDFS allows you to append data to files. Therefore, you can continuously dump data into a single file without worries. HDFS physically stores files differently than a normal file system. It splits the file into blocks. These blocks are then distributed and replicated on the Hadoop cluster. The splitting happens automatically. ![Distribution of Blocks for a 512MB File](/images/HDFS-Distributed-FileSystem.jpg) In the configuration you can define how big the blocks should be. 128 megabytes or 1 gigabyte? No problem at all. This mechanic of splitting a large file into blocks and distributing them over the servers is great for processing. See the MapReduce section for an example. ##### Document Store MongoDB | Podcast Episode: #093 What is MongoDB |------------------| |What is the difference between SQL and NoSQL? In this episode I show you on the example of HBase how a key/value store works. | [Watch on YouTube](https://youtu.be/U05knQN29FA) **Links:** What is MongoDB: Or directly from MongoDB.com: Storage in BSON files: Hello World in MongoDB: Real-Time Analytics on MongoDB Data in Power BI: Spark and MongoDB: MongoDB vs Time Series Database: Fun article titled why you should never use mongodb: MongoDB vs Cassandra: ##### Elasticsearch Search Engine and Document Store Elasticsearch is not a DB but first and foremost a search engine that indexes JSON documents.
| Podcast Episode: #095 What is Elasticsearch & Why is It So Popular? |------------------| |Elasticsearch is a super popular tool for indexing and searching data. On this stream we check out how it works, architectures and what to use it for. There must be a reason why it is so popular. | [Watch on YouTube](https://youtu.be/hNb5zB4OPXM) Links: Great example for architecture with Elasticsearch, Logstash and Kibana:\ Introduction to Elasticsearch in the documentation:\ Working with JSON documents:\ JSONs need to be flattened heres how to work with nested objects in the JSON:\ Indexing basics:\ How to do searches with search API:\ General recommendations when working with Elasticsearch:\ JSON document example and intro to Kibana:\ How to connect Tableau to Elasticsearch:\ Benchmarks how fast Elasticsearch is:\ Elasticsearch vs MongoDB quick overview:\ Logstash overview (preprocesses data before insert into Elasticsearch) X-Pack Security for Elasticsearch:\ Google Trends Grafana vs Kibana:\ ##### Apache Impala [Apache Impala Homepage](https://impala.apache.org/) ##### Kudu ##### Apache Druid | Podcast Episode: Druid NoSQL DB and Analytics DB Introduction |------------------| |In this video I explain what Druid is and how it works. We look into the architecture of a Druid cluster and check out how Clients access the data. |[Watch on YouTube](https://youtu.be/EiEIeBXSWjM) ##### InfluxDB Time Series Database What is time-series data? 
Key concepts: InfluxDB and Spark Streaming Building a Streaming application with Spark, Grafana, Chronograf and InfluxDB: Performance Dashboard Spark and InfluxDB: Other alternatives for time series databases are: DalmatinerDB, QuestDB, Prometheus, Riak TS, OpenTSDB, KairosDB ##### MPP Databases (Greenplum) ##### Azure Cosmos DB https://azure.microsoft.com/en-us/services/cosmos-db/ ##### Azure Table-Storage https://azure.microsoft.com/en-us/services/storage/tables/ #### NoSQL Data warehouse ##### Hive Warehouse ##### Impala ## Visualize ### Android & IOS ### How to design APIs for mobile apps ### How to use Webservers to display content ### Dashboards #### Grafana #### Kibana #### Tomcat #### Jetty #### NodeRED #### React ### Business Intelligence Tools #### Tableau #### PowerBI #### Quliksense ### Identity & Device Management #### What is a digital twin? #### Active Directory Machine Learning ---------------- | Podcast Episode: Machine Learning In Production |------------------| |Doing machine learning in production is very different than for proof of concepts or in education. One of the hardest parts is keeping models updated. | [Listen on Anchor](https://anchor.fm/andreaskayy/episodes/Machine-Learning-In-Production-e11bbk) ### How to do Machine Learning in production Machine learning in production is using stream and batch processing. In the batch processing layer you are creating the models, because you have all the data available for training. In the stream processing layer you are using the created models, you are applying them to new data. The idea that you need to incorporate is that it is a constant cycle. Training, applying, re-training, pushing into production and applying. What you don't want to do is do this manually. You need to figure out a process of automatic retraining and automatic pushing into production of models. In the retraining phase the system automatically evaluates the training.
If the model no longer fits, the system keeps training for as long as it needs to create a good model. After the evaluation of the model is complete and it's good, the model gets pushed into production. Into the stream processing. ### Why machine learning in production is harder then you think How to automate machine learning is something that drives me day in and day out. What you do in development or education is that you create a model and fit it to the data. Then that model is basically done forever. Where I'm coming from, the IoT world, the problem is that machines are very different. They behave very differently and experience wear. ### Models Do Not Work Forever Machines have certain processes that decrease the actual health of the machine. Machine wear is a huge issue. Models that are built on top of a good machine don't work forever. When the Machine wears out, the models need to be adjusted. They need to be maintained, retrained. ### Where The Platforms That Support This? Automatic re-training and re-deploying is a very big issue, a very big problem for a lot of companies. Because most existing platforms don't have this capability (I actually haven't seen one until now). Look at AWS machine learning for instance. The process is: build, train, tune, deploy. Where's the loop of retraining? You can create models and then use them in production. But this loop is almost nowhere to be seen. It is a very big issue that needs to be solved. If you want to do machine learning in production you can start with manual interaction of the training, but at some point you need to automate everything. ### Training Parameter Management To train a model you are manipulating input parameters of the models. Take deep learning for instance. To train you are manipulating for instance: \- How many layers do you use. - The width of the layers, which means how many neurons you have in a layer. - What activation function you use, how long are you training and so on.
You also need to keep track of what data you used to train which model. All those parameters need to be manipulated automatically, models trained and tested. To do all that, you basically need a database that keeps track of those variables. How to automate this, for me, is like the big secret. I am still working on figuring it out. ### What's Your Solution? Did you already have the problem of automatic re-training and deploying of models as well? Were you able to use a cloud platform like Google, AWS or Azure? It would be really awesome if you share your experience :) ### How to convince people machine learning works Many people still are not convinced that machine learning works reliably. But they want analytics insight and most of the time machine learning is the way to go. This means, when you are working with customers you need to do a lot of convincing. Especially if they are not into machine learning themselves. But it's actually quite easy. ### No Rules, No Physical Models Many people are still under the impression that analytics only works when it's based on physics. When there are strict mathematical rules to a problem. Especially in engineering heavy countries like Germany this is the norm: "Sere has to be a Rule for Everysing!" (imagine a German accent). When you're engineering you are calculating stuff based on physics and not based on data. If you are constructing an airplane wing, you better make sure to use calculations so it doesn't fall off. And that's totally fine. Keep doing that! Machine learning has been around for decades. It didn't quite work as good as people hoped. We have to admit that. But there is this preconception that it still doesn't work. Which is not true: Machine learning works. Somehow you need to convince people that it is a viable approach. That learning from data to make predictions is working perfectly. ### You Have The Data. USE IT! 
As a data scientist you have one ace up your sleeve, it's the obvious one: It's the data and it's statistics. You can use that data and those statistics to counter people's preconceptions. It's very powerful if someone says: "This doesn't work." You bring the data. You show the statistics and you show that it works reliably. A lot of discussions end there. Data doesn't lie. You can't fight data. The data is always right. ### Data is Stronger Than Opinions This is also why I believe that autonomous driving will come quicker than many of us think. Because a lot of people say that self-driving cars are not safe. That you cannot rely on those cars. The thing is: When you have the data you can do the statistics. You can show people that autonomous driving really works reliably. You will see, the question of \"Is this allowed or is this not allowed?\" will be gone quicker than you think. Because government agencies can start testing the algorithms based on predefined scenarios. They can run benchmarks and score the cars' performance. All those opinions, if it works, or if it doesn't work, they will be gone. The motor agency has the statistics. The stats show people how well the cars work. Companies like Tesla, they have it very easy. Because the data is already there. **They just need to show us that the algorithms work.
The end.** ### AWS Sagemaker Train and apply models online with Sagemaker Link to the OLX Slideshare with pros, cons and how to use Sagemaker: ================================================ FILE: sections/04-HandsOnCourse.md ================================================ Data Engineering Course: Building A Data Platform ================================================= ## Contents - [GenAI Retrieval Augmented Generation with Ollama and Elasticsearch](04-HandsOnCourse.md#genai-retrieval-augmented-generation-with-ollama-and-elasticsearch) - [Free Data Engineering Course with AWS, TDengine, Docker and Grafana](04-HandsOnCourse.md#free-data-engineering-course-with-aws-tdengine-docker-and-grafana) - [Monitor your data in dbt & detect quality issues with Elementary](04-HandsOnCourse.md#monitor-your-data-in-dbt-and-detect-quality-issues-with-elementary) - [Solving Engineers 4 Biggest Airflow Problems](04-HandsOnCourse.md#solving-engineers-4-biggest-airflow-problems) - [The best alternative to Airlfow? Mage.ai](04-HandsOnCourse.md#the-best-alternative-to-airlfow?-mage.ai) ## GenAI Retrieval Augmented Generation with Ollama and Elasticsearch - This how-to is based on this one from Elasticsearch: https://www.elastic.co/search-labs/blog/rag-with-llamaIndex-and-elasticsearch - Instead of Elasticsearch cloud we're going to run everything locally - The simplest way to get this done is to just clone this GitHub Repo for the code and docker setup - I've tried this on a M1 Mac. Changes for Windows with WSL will come later. - The biggest problems that I had were actually installing the dependencies rather than the code itself. ### Install Ollama 1. Download Ollama from here https://ollama.com/download/mac 2. Unzip, drag into applications and install 3. do `ollama run mistral` (It's going to download the Mistral 7b model, 4.1GB size) 4. Create a new folder in Documents "Elasticsearch-RAG" 5. Open that folder in VSCode ### Install Elasticsearch & Kibana (Docker) 1. 
Use the docker-compose file from this repo: https://github.com/andkret/Cookbook/blob/master/Code%20Examples/GenAI-RAG/docker-compose.yml 2. Download Docker Desktop from here: https://www.docker.com/products/docker-desktop/ 3. Install docker desktop and sign in in the app/create a user -> sends you to the browser **For Windows Users** Configure WSL2 to use max only 4GB of ram: ``` wsl --shutdown notepad "$env:USERPROFILE/.wslconfig" ``` .wslconfig file: ``` [wsl2] memory=4GB # Limits VM memory in WSL 2 up to 4GB ``` **Modify the Linux kernel map count in WSL** Do this before the start because Elasticsearch requires a higher value to work `sudo sysctl -w vm.max_map_count=262144` 4. go to the Elasticsearch-RAG folder and do `docker compose up` 5. make sure you have Elasticsearch 8.11 or later (we use 8.16 here in this project) if you want to use your own Elasticsearch image 6. if you get this error on a mac then just open the console in the docker app: *error getting credentials - err: exec: docker-credential-desktop: executable file not found in $PATH, out:* 7. Install xcode command line tools: `xcode-select --install` 8. 
make sure you're at python 3.8.1 or later -> installed 3.13.0 from https://www.python.org/downloads/ ### Setup the virtual Python environment #### preparation on a Mac ##### install brew which brew /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)" export PATH="/opt/homebrew/bin:$PATH" brew --version brew install pyenv brew install pyenv-virtualenv ##### install pyenv ``` brew install pyenv brew install pyenv-virtualenv ``` Modify the path so that pyenv is in the path variable `nano ~/.zshrc` ``` export PYENV_ROOT="$HOME/.pyenv" export PATH="$PYENV_ROOT/bin:$PATH" eval "$(pyenv init --path)" eval "$(pyenv init -)" eval "$(pyenv virtualenv-init -)" ``` install dependencies for building python versions `brew install openssl readline sqlite3 xz zlib` Reload to apply changes `source ~/.zshrc` install python ``` pyenv install 3.11.6 pyenv version ``` Set Python version system wide `pyenv global 3.11.6` ``` pyenv virtualenv pyenv activate pyenv virtualenv-delete ``` #### Windows without pyenv setup virtual python environment - go to the Elasticsearch-RAG folder and do `python3 -m venv .elkrag` enable the environment `source .elkrag/bin/activate` ### Install required libraries (do one at a time so you see errors): ``` pip install llama-index (optional python3 -m pip install package name) pip install llama-index-embeddings-ollama pip install llama-index-llms-ollama pip install llama-index-vector-stores-elasticsearch pip install python-dotenv ``` ### Write the data to Elasticsearch 1. create / copy in the index.py file 2. download the conversations.json file from the folder code examples/GenAI-RAG 3. if you get an error with the execution then check if pydantic version is <2.0 `pip show pydantic` if not do this: `pip install "pydantic<2.0"` 4. run the program index.py: https://github.com/andkret/Cookbook/blob/master/Code%20Examples/GenAI-RAG/index.py ### Check the data in Elasticsearch 1.
go to kibana http://localhost:5601/app/management/data/index_management/indices and see the new index called calls 2. go to dev tools and try out this query `GET calls/_search?size=1 http://localhost:5601/app/dev_tools#/console/shell` ### Query data from elasticsearch and create an output with Mistral 1. if everything is good then run the query.py file https://github.com/andkret/Cookbook/blob/master/Code%20Examples/GenAI-RAG/query.py 2. try a few queries :) ### Install libraries to extract text from pdfs ### Extract data from CV and put it into Elasticsearch I created a CV with ChatGPT https://github.com/andkret/Cookbook/blob/master/Code%20Examples/GenAI-RAG/Liam_McGivney_CV.pdf Install the library to extract text from the pdf `pip install PyMuPDF` I had to Shift+Command+p then python clear workspace cache and reload window. Then it saw it :/ The file cvpipeline.py has the python code for the indexing. It's not working right now though! https://github.com/andkret/Cookbook/blob/master/Code%20Examples/GenAI-RAG/cvpipeline.py I'll keep developing this and update it once it's working. ## Free Data Engineering Course with AWS TDengine Docker and Grafana **Free hands-on course:** [Watch on YouTube](https://youtu.be/eoj-CnrR9jA) In this detailed tutorial video, Andreas guides viewers through creating an end-to-end data pipeline using time series data. The project focuses on fetching weather data from a Weather API, processing it on AWS, storing it in TDengine (a time series database), and visualizing the data with Grafana. Here's a concise summary of what the video covers: 1. **Introduction and Setup:** - The project is introduced along with a GitHub repository containing all necessary resources and a step-by-step guide. - The pipeline architecture includes an IoT weather station, a Weather API, AWS for processing, TDengine for data storage, and Grafana for visualization. 2. **Project Components:** - **Weather API:** Utilizes weatherapi.com to fetch weather data. 
- **AWS Lambda:** Processes the data fetched from the Weather API. - **TDengine:** Serves as the time series database to store processed data. It's highlighted for its performance and simplicity, especially for time series data. - **Grafana:** Used for creating dashboards to visualize the time series data. 3. **Development and Deployment:** - The local development environment setup includes Python, Docker, and VS Code. - The tutorial covers the creation of a Docker image for the project and deploying it to AWS Elastic Container Registry (ECR). - AWS Lambda is then configured to use the Docker image from ECR. - AWS EventBridge is used to schedule the Lambda function to run at specified intervals. 4. **Time Series Data:** - The importance of time series data and the benefits of using a time series database like TDengine over traditional relational databases are discussed. - TDengine's features such as speed, scaling, data retention, and built-in functions for time series data are highlighted. 5. **Building the Pipeline:** - Detailed instructions are provided for setting up each component of the pipeline: - Fetching weather data from the Weather API. - Processing and sending the data to TDengine using an AWS Lambda function. - Visualizing the data with Grafana. - Each step includes code snippets and configurations needed to implement the pipeline. 6. **Conclusion:** - The video concludes with a demonstration of the completed pipeline, showing weather data visualizations in Grafana. - Viewers are encouraged to replicate the project using the resources provided in the GitHub repository linked in the video description. This video provides a comprehensive guide to building a data pipeline with a focus on time series data, demonstrating the integration of various technologies and platforms to achieve an end-to-end solution. 
## Monitor your data in dbt and detect quality issues with Elementary **Free hands-on tutorial:** [Watch on YouTube](https://youtu.be/6fnU91Q2gq0) In this comprehensive tutorial, Andreas delves into the integration of dbt (data build tool) with Elementary to enhance data monitoring and quality detection within Snowflake databases. The tutorial is structured to guide viewers through a hands-on experience, starting with an introduction to a sample project setup and the common challenges faced in monitoring dbt jobs. It then transitions into how Elementary can be utilized to address these challenges effectively. Key learning points and tutorial structure include: 1. **Introduction to the Sample Project:** Andreas showcases a project setup involving Snowflake as the data warehouse, dbt for data modeling and testing, and a visualization tool for data analysis. This setup serves as the basis for the tutorial. 2. **Challenges in Monitoring dbt Jobs:** Common issues in monitoring dbt jobs are discussed, highlighting the limitations of the dbt interface in providing comprehensive monitoring capabilities. 3. **Introduction to Elementary:** Elementary is introduced as a dbt-native data observability tool designed to enhance the monitoring and analysis of dbt jobs. It offers both open-source and cloud versions, with the tutorial focusing on the cloud version. 4. **Setup Requirements:** The tutorial covers the necessary setup on both the Snowflake and dbt sides, including schema creation, user and role configuration in Snowflake, and modifications to the dbt project for integrating with Elementary. 5. **Elementary's User Interface and Features:** A thorough walkthrough of Elementary's interface is provided, showcasing its dashboard, test results, model runs, data catalog, and data lineage features. The tool's ability to automatically run additional tests, like anomaly detection and schema change detection, is also highlighted. 6. 
**Advantages of Using Elementary:** The presenter outlines several benefits of using Elementary, such as easy implementation, native test integration, clean and straightforward UI, and enhanced privacy due to data being stored within the user's data warehouse. 7. **Potential Drawbacks:** Some potential drawbacks are discussed, including the additional load on dbt job execution due to more models being run and limitations in dashboard customization. 8. **Summary and Verdict:** The tutorial concludes with a summary of the key features and benefits of using Elementary with dbt, emphasizing its value in improving data quality monitoring and detection. Overall, viewers are guided through setting up and utilizing Elementary for dbt data monitoring, gaining insights into its capabilities, setup process, and the practical benefits it offers for data quality assurance. ## Solving Engineers 4 Biggest Airflow Problems **Free hands-on tutorial:** [Watch on YouTube](https://youtu.be/b9bMNEh8bes) In this informative video, Andreas discusses the four major challenges engineers face when working with Apache Airflow and introduces Astronomer, a managed Airflow service that addresses these issues effectively. Astronomer is highlighted as a solution that simplifies Airflow deployment and management, making it easier for engineers to develop, deploy, and monitor their data pipelines. Here's a summary of the key points discussed for each challenge and how Astronomer provides solutions: 1. Managing Airflow Deployments: - **Challenge:** Setting up and maintaining Airflow deployments is complex and time-consuming, involving configuring cloud instances, managing resources, scaling, and updating the Airflow system. - **Solution with Astronomer:** Offers a straightforward deployment process where users can easily configure their deployments, choose cloud providers (GCP, AWS, Azure), and set up scaling with just a few clicks. 
Astronomer handles the complexity, making it easier to manage production and quality environments. 2. Development Environment and Deployment: - **Challenge:** Local installation of Airflow is complicated due to its dependency on multiple Docker containers and the need for extensive configuration. - **Solution with Astronomer:** Provides a CLI tool for setting up a local development environment with a single command, simplifying the process of developing, testing, and deploying pipelines. The Astronomer CLI also helps in initializing project templates and deploying Dags to the cloud effortlessly. 3. Source Code Management and CI/CD Pipelines: - **Challenge:** Collaborative development and continuous integration/deployment (CI/CD) are essential but challenging to implement effectively with Airflow alone. - **Solution with Astronomer:** Facilitates easy integration with GitHub for source code management and GitHub Actions for CI/CD. This allows automatic testing and deployment of pipeline code, ensuring a smooth workflow for teams working on pipeline development. 4. Observing Pipelines and Alarms: - **Challenge:** Monitoring data pipelines and getting timely alerts when issues occur is crucial but often difficult to achieve. - **Solution with Astronomer:** The Astronomer platform provides a user-friendly interface for monitoring pipeline status and performance. It also offers customizable alerts for failures or prolonged task durations, with notifications via email, PagerDuty, or Slack, ensuring immediate awareness and response to issues. Overall, the video shows Astronomer as a powerful and user-friendly platform that addresses the common challenges of using Airflow, from deployment and development to collaboration, CI/CD, and monitoring. It suggests that Astronomer can significantly improve the experience of engineers working with Airflow, making it easier to manage, develop, and monitor data pipelines. ## The best alternative to Airlfow? 
Mage.ai **Free hands-on tutorial:** [Watch on YouTube](https://youtu.be/3gXsFEC3aYA) In this insightful video, Andreas introduces Mage, a promising alternative to Apache Airflow, focusing on its simplicity, user-friendliness, and scalability. The video provides a comprehensive walkthrough of Mage, highlighting its key features and advantages over Airflow. Here's a breakdown of what viewers can learn and expect from the video: 1. **Deployment Ease:** Mage offers a stark contrast to Airflow's complex setup process. It simplifies deployment to a single Docker image, making it straightforward to install and start on any machine, whether it's local or cloud-based on AWS, GCP, or Azure. This simplicity extends to scaling, which Mage handles horizontally, particularly beneficial in Kubernetes environments where performance scales with the number of pipelines. 2. **User Interface (UI):** Mage shines with its UI, presenting a dark mode interface that's not only visually appealing but also simplifies navigation and pipeline management. The UI facilitates easy access to pipelines, scheduling, and monitoring of pipeline runs, offering a more intuitive experience compared to Airflow. 3. **Pipeline Creation and Modification:** Mage streamlines the creation of ETL pipelines, allowing users to easily add data loaders, transformers, and exporters through its UI. It supports direct interaction with APIs for data loading and provides a visual representation of the data flow, enhancing the overall pipeline design experience. 4. **Data Visualization and Exploration:** Beyond simple pipeline creation, Mage enables in-depth data exploration within the UI. Users can generate various charts, such as histograms and bar charts, to analyze the data directly, a feature that greatly enhances the tool's utility. 5. **Testing and Scheduling:** Testing pipelines in Mage is straightforward, allowing for quick integration of tests to ensure data quality and pipeline reliability. 
Scheduling is also versatile, supporting standard time-based triggers, event-based triggers for real-time data ingestion, and API calls for on-demand pipeline execution. 6. **Support for Streaming and ELT Processes:** Mage is not limited to ETL workflows but also supports streaming and ELT processes. It integrates seamlessly with DBT models for in-warehouse transformations and Spark for big data processing, showcasing its versatility and scalability. 7. **Conclusion and Call to Action:** Andreas concludes by praising the direction in which the industry is moving, with tools like Mage simplifying data engineering processes. He encourages viewers to try Mage and engage with the content by liking, subscribing, and commenting on their current tools and the potential impact of Mage. Overall, the video shows Mage as a highly user-friendly, scalable, and versatile tool for data pipeline creation and management, offering a compelling alternative to traditional tools like Airflow. ================================================ FILE: sections/05-CaseStudies.md ================================================ Case Studies ============ ## Contents - [Data Science @Airbnb](05-CaseStudies.md#data-science-at-Airbnb) - [Data Science @Amazon](05-CaseStudies.md#data-science-at-Amazon) - [Data Science @Baidu](05-CaseStudies.md#data-science-at-Baidu) - [Data Science @Blackrock](05-CaseStudies.md#data-science-at-Blackrock) - [Data Science @BMW](05-CaseStudies.md#data-science-at-BMW) - [Data Science @Booking.com](05-CaseStudies.md#data-science-at-Booking.com) - [Data Science @CERN](05-CaseStudies.md#data-science-at-CERN) - [Data Science @Disney](05-CaseStudies.md#data-science-at-Disney) - [Data Science @DLR](05-CaseStudies.md#data-science-at-DLR) - [Data Science @Drivetribe](05-CaseStudies.md#data-science-at-Drivetribe) - [Data Science @Dropbox](05-CaseStudies.md#data-science-at-Dropbox) - [Data Science @Ebay](05-CaseStudies.md#data-science-at-Ebay) - [Data Science 
@Expedia](05-CaseStudies.md#data-science-at-Expedia) - [Data Science @Facebook](05-CaseStudies.md#data-science-at-Facebook) - [Data Science @Google](05-CaseStudies.md#data-science-at-Google) - [Data Science @Grammarly](05-CaseStudies.md#data-science-at-Grammarly) - [Data Science @ING Fraud](05-CaseStudies.md#data-science-at-ING-Fraud) - [Data Science @Instagram](05-CaseStudies.md#data-science-at-Instagram) - [Data Science @LinkedIn](05-CaseStudies.md#data-science-at-LinkedIn) - [Data Science @Lyft](05-CaseStudies.md#data-science-at-Lyft) - [Data Science @NASA](05-CaseStudies.md#data-science-at-NASA) - [Data Science @Netflix](05-CaseStudies.md#data-science-at-Netflix) - [Data Science @OLX](05-CaseStudies.md#data-science-at-OLX) - [Data Science @OTTO](05-CaseStudies.md#data-science-at-OTTO) - [Data Science @Paypal](05-CaseStudies.md#data-science-at-Paypal) - [Data Science @Pinterest](05-CaseStudies.md#data-science-at-Pinterest) - [Data Science @Salesforce](05-CaseStudies.md#data-science-at-Salesforce) - [Data Science @Siemens Mindsphere](05-CaseStudies.md#data-science-at-Siemens-Mindsphere) - [Data Science @Slack](05-CaseStudies.md#data-science-at-Slack) - [Data Science @Spotify](05-CaseStudies.md#data-science-at-Spotify) - [Data Science @Symantec](05-CaseStudies.md#data-science-at-Symantec) - [Data Science @Tinder](05-CaseStudies.md#data-science-at-Tinder) - [Data Science @Twitter](05-CaseStudies.md#data-science-at-Twitter) - [Data Science @Uber](05-CaseStudies.md#data-science-at-Uber) - [Data Science @Upwork](05-CaseStudies.md#data-science-at-Upwork) - [Data Science @Woot](05-CaseStudies.md#data-science-at-Woot) - [Data Science @Zalando](05-CaseStudies.md#data-science-at-Zalando) How I do Case Studies --------------------- ### Data Science at Airbnb | Podcast Episode: #063 Data Engineering At Airbnb Case Study |------------------| |How Airbnb is doing data engineering? Let’s check it out. 
| [Watch on YouTube](https://youtu.be/iokqkMfyIfo) \ [Listen on Anchor](https://anchor.fm/andreaskayy/episodes/063-Data-Engineering-At-Airbnb-Case-Study-e45il2)| **Slides:** Airbnb Engineering Blog: Data Infrastructure: Scaling the serving tier: Druid Analytics: Spark Streaming for logging events: -Druid Wiki: ### Data Science at Amazon ### Data Science at Baidu ### Data Science at Blackrock ### Data Science at BMW ### Data Science at Booking.com | Podcast Episode: #064 Data Engineering at Booking.com Case Study |------------------| |How Booking.com is doing data engineering? Let’s check it out. | [Watch on YouTube](https://youtu.be/9GE3yiVo1FM) \ [Listen on Anchor](https://anchor.fm/andreaskayy/episodes/064-Data-Engineering-At-Booking-com-Case-Study-e45ilg)| **Slides:** Druid: Kafka Architecture: Confluent Platform: ### Data Science at CERN | Podcast Episode: #065 Data Engineering At CERN Case Study |------------------| |How is CERN doing Data Engineering? They must get huge amounts of data from the Large Hadron Collider. Let’s check it out. 
| [Watch on YouTube](https://youtu.be/LrhfzPsKaDE) \ [Listen on Anchor](https://anchor.fm/andreaskayy/episodes/065-Data-Engineering-At-CERN-Case-Study-e45ime)| **Slides:** ### Data Science at Disney ### Data Science at DLR ### Data Science at Drivetribe ### Data Science at Dropbox ### Data Science at Ebay ### Data Science at Expedia ### Data Science at Facebook ### Data Science at Google \ \ \ ### Data Science at Grammarly ### Data Science at ING Fraud ### Data Science at Instagram ### Data Science at LinkedIn | Podcast Episode: #073 Data Engineering At LinkedIn Case Study |------------------| |Let’s check out how LinkedIn is processing data :) | [Watch on YouTube](https://youtu.be/wgfoE8Jbr_Q) \ [Listen on Anchor](https://anchor.fm/andreaskayy/episodes/073-Data-Engineering-At-LinkedIn-Case-Study-e45is6)| **Slides:** ### Data Science at Lyft ### Data Science at NASA | Podcast Episode: #067 Data Engineering At NASA Case Study |------------------| |A look into how NASA is doing data engineering. | [Watch on YouTube](https://youtu.be/Pctn_1UoNjA) \ [Listen on Anchor](https://anchor.fm/andreaskayy/episodes/067-Data-Engineering-At-NASA-Case-Study-e45ina)| **Slides:** ### Data Science at Netflix | Podcast Episode: #062 Data Engineering At Netflix Case Study |------------------| |How Netflix is doing Data Engineering using their Keystone platform. | [Watch on YouTube](https://youtu.be/YWPsYpjNKeM) \ [Listen on Anchor](https://anchor.fm/andreaskayy/episodes/062-Data-Engineering-At-Netflix-Case-Study-e45ikp)| Netflix revolutionized how we watch movies and TV. Currently over 75 million users watch 125 million hours of Netflix content every day! Netflix's revenue comes from a monthly subscription service. So, the goal for Netflix is to keep you subscribed and to get new subscribers. To achieve this, Netflix is licensing movies from studios as well as creating its own original movies and TV series. But offering new content is not everything. 
What is also very important is to keep you watching content that already exists. To be able to recommend content to you, Netflix is collecting data from users. And it is collecting a lot. Currently, Netflix analyses about 500 billion user events per day. That results in a stunning 1.3 Petabytes every day. All this data allows Netflix to build recommender systems for you. The recommenders are showing you content that you might like, based on your viewing habits, or what is currently trending. ###### The Netflix batch processing pipeline When Netflix started out, they had a very simple batch processing system architecture. The key components were Chukwa, a scalable data collection system, Amazon S3 and Elastic MapReduce. ![Old Netflix Batch Processing Pipeline](/images/Netflix-Chuckwa-Pipeline.jpg) Chukwa wrote incoming messages into Hadoop sequence files, stored in Amazon S3. These files could then be analysed by Elastic MapReduce jobs. Netflix batch processing pipeline Jobs were executed regularly on a daily and hourly basis. As a result, Netflix could learn how people used the services every hour or once a day. ###### Know what customers want: Because you are looking at the big picture you can create new products. Netflix uses insight from big data to create new TV shows and movies. They created House of Cards based on data. There is a very interesting TED talk about this you should watch: [How to use data to make a hit TV show \| Sebastian Wernicke](https://www.youtube.com/watch?v=vQILP19qABk) Batch processing also helps Netflix to know the exact episode of a TV show that gets you hooked. Not only globally but for every country where Netflix is available. Check out the article from The Verge. They know exactly what show works in what country and what show does not. It helps them create shows that work everywhere or select the shows to license in different countries.
Germany for instance does not have the full library that Americans have :( We have to put up with only a small portion of TV shows and movies. If you have to select, why not select those that work best? ###### Batch processing is not enough As a data platform for generating insight the Chukwa pipeline was a good start. It is very important to be able to create hourly and daily aggregated views for user behavior. To this day Netflix is still doing a lot of batch processing jobs. The only problem is: With batch processing you are basically looking into the past. For Netflix, and data-driven companies in general, looking into the past is not enough. They want a live view of what is happening. ###### The trending now feature One of the newer Netflix features is "Trending now". To the average user it looks like "Trending Now" means currently most watched. This is what I get displayed as trending while I am writing this on a Saturday morning at 8:00 in Germany. But it is so much more. What is currently being watched is only a part of the data that is used to generate "Trending Now". ![Netflix Trending Now Feature](/images/Netflix-Trending-Now-Screenshot.jpg) "Trending now" is created based on two types of data sources: Play events and Impression events. What messages those two types actually include is not really communicated by Netflix. I did some research on the Netflix Techblog and this is what I found out: Play events include what title you have watched last, where you stopped watching, where you used the 30s rewind and others. Impression events are collected as you browse the Netflix library: scrolling up and down, scrolling left or right, clicking on a movie and so on. Basically, play events log what you do while you are watching. Impression events are capturing what you do on Netflix while you are not watching something.
###### Netflix real-time streaming architecture Netflix uses three internet-facing services to exchange data with the client's browser or mobile app. These services are simple Apache Tomcat-based web services. The service for receiving play events is called "Viewing History". Impression events are collected with the "Beacon" service. The "Recommender Service" makes recommendations based on trend data available for clients. Messages from the Beacon and Viewing History services are put into Apache Kafka. It acts as a buffer between the data services and the analytics. Beacon and Viewing History publish messages to Kafka topics. The analytics subscribes to the topics and gets the messages automatically delivered in a first-in, first-out fashion. After the analytics the workflow is straightforward. The trending data is stored in a Cassandra Key-Value store. The recommender service has access to Cassandra and is making the data available to the Netflix client. ![Netflix Streaming Pipeline](/images/Netflix-Streaming-Pipeline.jpg) How the analytics system's algorithms process all this data is not known to the public. It is a trade secret of Netflix. What is known is the analytics tool they use. Back in Feb 2015 they wrote in the tech blog that they use a custom-made tool. They also stated that Netflix is going to replace the custom-made analytics tool with Apache Spark Streaming in the future. My guess is that they made the switch to Spark some time ago, because their post is more than a year old. ### Data Science at OLX | Podcast Episode: #083 Data Engineering at OLX Case Study |------------------| |This podcast is a case study about OLX with Senior Data Scientist Alexey Grigorev as guest. It was super fun.
| [Watch on YouTube](https://youtu.be/H_uFNoCvykM) \ [Listen on Anchor](https://anchor.fm/andreaskayy/episodes/083-Data-Engineering-at-OLX-Case-Study-e45j5n)| **Slides:** ### Data Science at OTTO ### Data Science at Paypal ### Data Science at Pinterest | Podcast Episode: #069 Engineering Culture At Pinterest |------------------| |In this podcast we look into data platform and processing at Pinterest. | [Watch on YouTube](https://youtu.be/cqWXGVoDX8Q) \ [Listen on Anchor](https://anchor.fm/andreaskayy/episodes/069-Data-Engineering-At-Pinterest-Case-Study-e45ioh)| **Slides:** ### Data Science at Salesforce ### Data Science at Siemens Mindsphere | Podcast Episode: #059 What Is The Siemens Mindsphere IoT Platform? |------------------| |The Internet of things is a huge deal. There are many platforms available. But, which one is actually good? Join me on a 50 minute dive into the Siemens Mindsphere online documentation. I have to say I was super unimpressed by what I found. Many limitations, unclear architecture and no pricing available? Not good! | [Watch on YouTube](https://youtu.be/HEd5Tsuy5HE) \ [Listen on Anchor](https://anchor.fm/andreaskayy/episodes/059-A-Look-Into-The-Siemens-Mindsphere-IoT-Platform---059-e45ihn)| ### Data Science at Slack ### Data Science at Spotify | Podcast Episode: #071 Data Engineering At Spotify Case Study |------------------| |In this episode we are looking at data engineering at Spotify, my favorite music streaming service. How do they process all that data? | [Watch on YouTube](https://youtu.be/0WJZ5wtQRWI) \ [Listen on Anchor](https://anchor.fm/andreaskayy/episodes/071-Data-Engineering-At-Spotify-Case-Study-e45iq1)| **Slides:** ### Data Science at Symantec ### Data Science at Tinder ### Data Science at Twitter | Podcast Episode: #072 Data Engineering At Twitter Case Study |------------------| |How is Twitter doing data engineering? Oh man, they have a lot of cool things to share these tweets. 
| [Watch on YouTube](https://youtu.be/UkqSR3IeLZ8) \ [Listen on Anchor](https://anchor.fm/andreaskayy/episodes/072-Data-Engineering-At-Twitter-Case-Study-e45iqq)| **Slides:** ### Data Science at Uber ### Data Science at Upwork ### Data Science at Woot ### Data Science at Zalando | Podcast Episode: #087 Data Engineering At Zalando Case Study Talk |------------------| |I had a great conversation about data engineering for online retailing with Michal Gancarski and Max Schultze. They showed Zalando’s data platform and how they build data pipelines. Super interesting especially for AWS users. | [Watch on YouTube](https://youtu.be/IXOLsNA6Hm0) Do me a favor and give these guys a follow on LinkedIn: LinkedIn of Michal: LinkedIn of Max: Zalando has a tech blog with more info and there is also a meetup in Berlin: Zalando Blog: Next Zalando Data Engineering Meetup: Interesting tools: AWS CDK: Delta Lake: AWS Step Functions: [https://aws.amazon.com/step-functions/](https://aws.amazon.com/step-functions/) AWS State Language: [https://states-language.net/spec.html](https://states-language.net/spec.html) YouTube channel of the meetup: [https://www.youtube.com/channel/UCxwul7aBm2LybbpKGbCOYNA/playlists](https://www.youtube.com/channel/UCxwul7aBm2LybbpKGbCOYNA/playlists) Talk at Spark+AI Summit about Zalando's Processing Platform: Talk at Strata London slides: ================================================ FILE: sections/06-BestPracticesCloud.md ================================================ Best Practices Cloud Platforms ============================= This section is a collection of best practices on how you can combine the tools into a platform. It's here especially to help you start your own project in the cloud on AWS, Azure and GCP. Like the advanced skills section, this section also follows my [Data Science Platform Blueprint](sections/01-Introduction.md#my-big-data-platform-blueprint).
In the blueprint I divided the platform into sections: Connect, Buffer, Processing, Store and Visualize. This order will help you learn how to connect the right tools together. Take your time and research the tools and learn how they work. Right now the Azure section has a lot of links to platform examples. They are also useful for AWS and GCP, just try to change out the tools. As always, I am going to add more stuff to this over time. Have fun! ## Contents - [Amazon Web Services (AWS)](06-BestPracticesCloud.md#aws) - [Connect](06-BestPracticesCloud.md#Connect) - [Buffer](06-BestPracticesCloud.md#Buffer) - [Processing](06-BestPracticesCloud.md#Processing) - [Store](06-BestPracticesCloud.md#Store) - [Visualize](06-BestPracticesCloud.md#Visualize) - [Containerization](06-BestPracticesCloud.md#Containerization) - [Best Practices](06-BestPracticesCloud.md#Best-Practices) - [More Details](06-BestPracticesCloud.md#More-Details) - [Microsoft Azure](06-BestPracticesCloud.md#azure) - [Connect](06-BestPracticesCloud.md#Connect-1) - [Buffer](06-BestPracticesCloud.md#Buffer-1) - [Processing](06-BestPracticesCloud.md#Processing-1) - [Store](06-BestPracticesCloud.md#Store-1) - [Visualize](06-BestPracticesCloud.md#Visualize-1) - [Containerization](06-BestPracticesCloud.md#Containerization-1) - [Best Practices](06-BestPracticesCloud.md#Best-Practices-1) - [Google Cloud Platform (GCP)](06-BestPracticesCloud.md#gcp) - [Connect](06-BestPracticesCloud.md#Connect-2) - [Buffer](06-BestPracticesCloud.md#Buffer-2) - [Processing](06-BestPracticesCloud.md#Processing-2) - [Store](06-BestPracticesCloud.md#Store-2) - [Visualize](06-BestPracticesCloud.md#Visualize-2) - [Containerization](06-BestPracticesCloud.md#Containerization-2) - [Best Practices](06-BestPracticesCloud.md#Best-Practices-2) # AWS ## Connect - Elastic Beanstalk (very old) - SES Simple Email Service - API Gateway ## Buffer - Kinesis - Kinesis Data Firehose - Managed Streaming for Kafka (MSK) - MQ - Simple Queue Service (SQS) - 
Simple Notification Service (SNS) ## Processing - EC2 - Athena - EMR - Elasticsearch - Kinesis Data Analytics - Glue - Step Functions - Fargate - Lambda - SageMaker ## Store - Simple Storage Service (S3) - Redshift - Aurora - RDS - DynamoDB - ElastiCache - Neptune Graph DB - Timestream - DocumentDB (MongoDB compatible) ## Visualize - Quicksight ## Containerization - Elastic Container Service (ECS) - Elastic Container Registry (ECR) - Elastic Kubernetes Service (EKS) ## Best Practices Deploying a Spring Boot Application on AWS Using AWS Elastic Beanstalk: [https://aws.amazon.com/de/blogs/devops/deploying-a-spring-boot-application-on-aws-using-aws-elastic-beanstalk/](https://aws.amazon.com/de/blogs/devops/deploying-a-spring-boot-application-on-aws-using-aws-elastic-beanstalk/) How to deploy a Docker Container on AWS: [https://aws.amazon.com/getting-started/hands-on/deploy-docker-containers/](https://aws.amazon.com/getting-started/hands-on/deploy-docker-containers/) #### AWS platform architecture for GenAI ![Imagetitle](/images/06/genai-enterprise.png) ▶ [Click here to watch](https://youtu.be/2yX6G4ZURbc) I recorded a reaction video to an AWS platform architecture for GenAI called Tailwinds. Presented by John from Innovative Solutions and Josh from AWS, it has two main flows: indexing and consumer. Data enters through S3 buckets or an API gateway, processed by AWS Lambda or Glue, and stored in a vector or graph database, then indexed in OpenSearch. Applications like chatbots use an API gateway to trigger Lambda functions for data retrieval and processing. This flexible serverless setup supports various data formats and uses tools like SAM and Terraform. Amazon Bedrock helps customers choose and evaluate models. The architecture is flexible but requires effort to create the necessary Lambda functions. Check out the video and share your thoughts! 
▶ [Click here to watch](https://youtu.be/2yX6G4ZURbc) #### Generative AI enabled job search engine ![Imagetitle](/images/06/job-search.png) ▶ [Click here to watch](https://youtu.be/dOWqasmqfHQ) Hey everyone, I recorded a reaction video to an AWS platform architecture for a Gen AI job search engine. Presented by Andrea from AWS and Bill from Healthy Careers, this setup uses generative AI to enhance job searches for healthcare professionals. The architecture uses Elastic Container Service (ECS) to handle user queries, processed by Claude II for prompt checks and geolocation. Cleaned prompts are vectorized using Amazon's Titan model, with user search history fetched from an SQL database. Search results are stored in Elasticsearch, updating every six hours. Finally, Claude II generates a response from the search results and sends it back to the user. I found the use of Claude II for prompt sanitization and geolocation, and the integration of multiple AI models through AWS Bedrock, particularly interesting. This setup keeps data private and provides a flexible, efficient job search experience. Check out the video and share your thoughts! #### Voice transcription and analysis on AWS ![Imagetitle](/images/06/voice-transcription.png) ▶ [Click here to watch](https://youtu.be/RGXRjOTQuBM) Hey everyone, I recorded a reaction video to an AWS architecture for voice transcription and analysis. Presented by Nuan from AWS and Ben from Assembly AI, this system is designed to handle large-scale audio data processing. Users upload audio data via an API to an ECS container. The data is then managed by an orchestrator that decides which models to use and in what order. The orchestrator sends tasks to SQS, which triggers various ML models running on ECS. These models handle tasks like speech-to-text conversion, sentiment analysis, and speaker labeling. Results are stored in S3 and users are notified via SNS and a Lambda function when processing is complete. 
I found the use of ECS for containerized applications and the flexibility of swapping models through ECR particularly interesting. This architecture ensures scalability and efficiency, making it ideal for handling millions of requests per day. Check out the video and share your thoughts! #### GeoSpatial Data Analysis ![Imagetitle](/images/06/geo-spacial.png) ▶ [Click here to watch](https://youtu.be/MxVJAvFSTXg) Hey everyone, I recorded a reaction video to an AWS architecture for geospatial data analysis by TCS. Presented by David John and Suryakant from TCS, this platform is used in next-gen agriculture for tasks like crop health, yield, and soil moisture analysis. The platform uses data from satellites, AWS open data, and field agents, processing it with Lambda, SageMaker, and PostgreSQL. Data is stored and analyzed in S3 buckets and PostgreSQL, with results made accessible via EKS-deployed UIs on EC2 instances, buffered through CloudFront for efficiency. Key aspects include: - Lambda functions triggering SageMaker jobs for machine learning. - SageMaker handling extensive processing tasks. - PostgreSQL and S3 for storing processed data. - CloudFront caching data to enhance user experience. I found the use of parallel SageMaker jobs for scalability and the integration of open data for cost efficiency particularly interesting. This setup effectively meets the agricultural sector's data analysis needs. Check out the video and share your thoughts! #### Building a Self-Service Enterprise Data Engineering Platform ![Imagetitle](/images/06/enterprise-solution.png) ▶ [Click here to watch](https://youtu.be/E9JFCl7bk88) Hey everyone, I recorded a reaction video to an AWS architecture for a self-service enterprise data engineering platform by ZS Associates. Presented by David John and Laken from ZS Associates, this platform is designed to streamline data integration, infrastructure provisioning, and data access for life sciences companies.
Key components: - **Users and Interaction**: Data engineers and analysts interact through a self-service web portal, selecting infrastructure types and providing project details. This portal makes REST requests to EKS, which creates records in PostgreSQL and triggers infrastructure provisioning via SQS. - **Infrastructure Provisioning**: EKS processes SQS messages to provision infrastructure such as EMR clusters, databases in Glue Catalog, S3 buckets, and EC2 instances with containerized services like Airflow or NiFi. IAM roles are configured for access control. - **Data Governance and Security**: All data sets are accessed through the Glue Catalog, with governance workflows requiring approval from data owners via SES notifications. EKS updates IAM roles and Ranger policies for fine-grained access control. - **Scalability and Efficiency**: EKS hosts 100+ microservices supporting workflows and UI portals. The platform handles millions of API requests and hundreds of data access requests monthly, with auto-scaling capabilities to manage costs. This architecture effectively reduces time to market, enhances security at scale, and optimizes costs by automating data access and infrastructure provisioning. It also ensures data governance and security through controlled access and approval processes. Check out the video and share your thoughts! #### Customer Support Platform ![Imagetitle](/images/06/customer-support.png) ▶ [Click here to watch](https://youtu.be/sCIFpOuryFU) Hey everyone, I recorded a reaction video to an AWS architecture for a personalized customer support platform by Traeger. Presented by David John and Lizzy from Traeger, this system enhances customer support by leveraging data from Shopify, EventBridge, Kinesis Data Firehose, S3, Lambda, DynamoDB, and Amazon Connect. Key components: - **Order Processing**: Customer order data from Shopify flows into EventBridge, then to Kinesis Data Firehose, which writes it to S3. 
An event trigger in S3 invokes a Lambda function that stores specific order metadata in DynamoDB. - **Personalized Customer Support**: When a customer calls, Amazon Connect uses Pinpoint to determine the call's origin, personalizing the language options. Connect triggers a Lambda function to query DynamoDB for customer metadata based on the phone number. This data is used to inform the customer support agent. - **Reason for Contact**: Amazon Lex bot asks the customer the reason for their call, and this information, along with customer metadata, routes the call to a specialized support queue. I found the use of DynamoDB for storing customer metadata and the integration with Amazon Connect and Lex for personalized support particularly interesting. The architecture is scalable and ensures a personalized experience for customers. Check out the video and share your thoughts! #### League of Legends Data Platform on AWS ![Imagetitle](/images/06/league.jpg) ▶ [Click here to watch](https://youtu.be/FX_ZUJk_WoE) Hey everyone, I recorded a reaction video to an AWS architecture for the data platform that powers League of Legends by Riot Games. Presented by David John and the team at Riot Games, this system handles massive amounts of data generated by millions of players worldwide. Key components: - **Player Interaction**: Players connect to game servers globally. The game client communicates with an API running in EKS. This setup ensures low latency and optimal performance. - **Data Ingestion**: The game client and server send data about player interactions to EKS, which flows into MSK (Managed Streaming for Kafka). Local Kafka clusters buffer the data before it’s replicated to regional MSK clusters using MirrorMaker. - **Data Processing**: Spark Streaming jobs process the data from MSK and store it in Delta Lake on S3. This setup ensures efficient data handling and reduces latency in data availability. 
- **Data Storage and Access**: Glue serves as the data catalog, managing metadata and permissions. Data consumers, including analysts, designers, engineers, and executives, access this data through Databricks, leveraging Glue for structured queries. I found the use of MSK and Spark for scalable data ingestion and processing particularly interesting. This architecture supports real-time analytics, allowing Riot Games to quickly assess the impact of new patches and gameplay changes. Check out the video and share your thoughts! #### Platform Connecting 70 Million Cars ![Imagetitle](/images/06/70m-cars.png) ▶ [Click here to watch](https://youtu.be/1nifzmvOGHs) Hey everyone, I recorded a reaction video to an AWS architecture for a connected car platform by Mobileye. Presented by David John and the team at Mobileye, this system connects 70 million cars, collecting and processing data to offer digital services and fleet analysis. Key components: - **Data Collection**: Cars collect anonymized data using sensors and visual inspections, sending it to a REST API and storing it in S3. - **Data Processing**: The data is pulled from S3 into SQS and processed by EKS workers, which scale according to the queue size. Processed data is stored back in S3 and further analyzed using step functions and Lambda for tasks like extracting construction zones and clustering observations. - **Data Storage**: Processed data is stored in S3, Elasticsearch, and CockroachDB. Elasticsearch handles document-based data with self-indexing, while CockroachDB supports frequent updates. - **Data Consumption**: EKS hosts a secured REST API and web application, allowing customers like city planners to access insights on pedestrian and bicycle traffic. Future plans include enabling cloud image processing on EKS with GPU instances and focusing on cost reduction as data flow increases. 
I found the use of EKS for scalable data processing and the combination of Elasticsearch and CockroachDB for different data needs particularly interesting. This architecture efficiently handles large-scale data from millions of connected cars. Check out the video and share your thoughts! #### 55TB A Day: Nielsen AWS Data Architecture ![Imagetitle](/images/06/55-tb.png) ▶ [Click here to watch](https://youtu.be/WCQe1VP_q5A) Hey everyone, I recorded a reaction video to an AWS architecture for Nielsen Marketing Cloud, which processes 55TB of data daily. Presented by David John, this system handles marketing segmentation data for campaigns. Key components: - **Data Ingestion**: Marketing data comes in files, written to S3. Spark on EMR processes and transforms the data, writing the output to another S3 bucket. - **Data Processing**: Lambda functions handle the final formatting and upload the data to over 100 ad networks. Metadata about file processing is managed in a PostgreSQL RDS database. - **Metadata Management**: A work manager Lambda reads metadata from RDS, triggers processing jobs in EMR, and updates the metadata post-processing. - **Scaling and Rate Limiting**: The serverless architecture allows automatic scaling. However, rate limiting is implemented to prevent overloading ad networks, ensuring they handle data bursts smoothly. Challenges and Solutions: - **Scale**: The system handles 250 billion events per day, scaling up and down automatically to manage peak loads. - **Rate Limiting**: To avoid overwhelming ad networks, a rate-limiting mechanism was introduced, managing data flow based on network capacity. - **Back Pressure Management**: SQS is used to buffer Lambda responses, preventing direct overload on the PostgreSQL database. I found the use of SQS for metadata management and the serverless architecture for handling massive data loads particularly interesting. This setup ensures efficient data processing and smooth delivery to ad networks. 
Check out the video and share your thoughts! #### Orange Theory Fitness ![Image](/images/06/fitness-1.jpeg) ▶ [Click here to watch](https://youtu.be/ssaXRo5s1r4) Hey, everybody! Today, I'm reacting to the AWS data infrastructure at Orange Theory Fitness, where they collect data from wristbands and training machines. Let's dive in and see how they manage it all. ### Key Components 1. **Local Server**: Aggregates data from in-studio equipment and mobile apps, ensuring resiliency if the cloud connection is lost. 2. **API Gateway and Cognito**: Handle authentication and route data to the cloud. 3. **Lambda Functions**: Process data. 4. **Aurora RDS (MySQL)**: Stores structured data like member profiles, class bookings, and studio information. 5. **DynamoDB**: Stores performance metrics and workout statistics for quick access. 6. **S3**: Serves as a data lake, storing telemetry data. 7. **Kinesis Firehose**: Streams telemetry data to S3. ### Challenges & Solutions 1. **Resiliency** - **Challenge**: Ensure operations continue if cloud connection is lost. - **Solution**: Local server aggregates data and syncs with the cloud once the connection is restored. 2. **Data Integration** - **Challenge**: Integrate data from various sources. - **Solution**: Use API Gateway and Cognito for unified authentication and data routing. 3. **Data Processing** - **Challenge**: Efficiently process and store different types of data. - **Solution**: Use Lambda for processing, Aurora RDS for structured data, DynamoDB for quick access to performance metrics, and Kinesis Firehose with S3 for streaming and storing large volumes of telemetry data. This architecture leverages AWS tools for scalability, flexibility, and resilience, making it an excellent example of a well-thought-out data infrastructure for a fitness application. Let me know your thoughts in the comments. What do you think of this architecture? Would you have done anything differently? If you have any questions, feel free to ask. 
And if you're interested in learning more about data engineering, check out my academy at learndataengineering.com. See you in the next video! ## More Details AWS Whitepapers: [https://d1.awsstatic.com/whitepapers/aws-overview.pdf](https://d1.awsstatic.com/whitepapers/aws-overview.pdf) # Azure ## Connect - Event Hub - IoT Hub ## Buffer - Data Factory - Event Hub - RedisCache (also Store) ## Processing - Stream Analytics Service - Azure Databricks - Machine Learning - Azure Functions - Azure HDInsight (Hadoop PaaS) ## Store - Blob - CosmosDB - MariaDB - MySQL - PostgreSQL - SQL - Azure Data lake - Azure Storage (SQL Table?) - Azure Synapse Analytics ## Visualize - PowerBI ## Containerization - Virtual Machines - Virtual Machine Scale Sets - Azure Container Service (AKS) - Container Instances - Azure Kubernetes Service ## Best Practices Advanced Analytics Architecture: [https://docs.microsoft.com/en-us/azure/architecture/solution-ideas/articles/advanced-analytics-on-big-data](https://docs.microsoft.com/en-us/azure/architecture/solution-ideas/articles/advanced-analytics-on-big-data) Anomaly Detection in Real-time Data Streams: [https://docs.microsoft.com/en-us/azure/architecture/solution-ideas/articles/anomaly-detection-in-real-time-data-streams](https://docs.microsoft.com/en-us/azure/architecture/solution-ideas/articles/anomaly-detection-in-real-time-data-streams) Modern Data Warehouse Architecture: [https://docs.microsoft.com/en-us/azure/architecture/solution-ideas/articles/modern-data-warehouse](https://docs.microsoft.com/en-us/azure/architecture/solution-ideas/articles/modern-data-warehouse) CI/CD for Containers: [https://docs.microsoft.com/en-us/azure/architecture/solution-ideas/articles/cicd-for-containers](https://docs.microsoft.com/en-us/azure/architecture/solution-ideas/articles/cicd-for-containers) Real Time Analytics on Big Data Architecture: 
[https://docs.microsoft.com/en-us/azure/architecture/solution-ideas/articles/real-time-analytics](https://docs.microsoft.com/en-us/azure/architecture/solution-ideas/articles/real-time-analytics) Anomaly Detection in Real-time Data Streams: [https://docs.microsoft.com/en-us/azure/architecture/solution-ideas/articles/anomaly-detection-in-real-time-data-streams](https://docs.microsoft.com/en-us/azure/architecture/solution-ideas/articles/anomaly-detection-in-real-time-data-streams) IoT Architecture – Azure IoT Subsystems: [https://docs.microsoft.com/en-us/azure/architecture/solution-ideas/articles/azure-iot-subsystems](https://docs.microsoft.com/en-us/azure/architecture/solution-ideas/articles/azure-iot-subsystems) Tier Applications & Data for Analytics: [https://docs.microsoft.com/en-us/azure/architecture/solution-ideas/articles/tiered-data-for-analytics](https://docs.microsoft.com/en-us/azure/architecture/solution-ideas/articles/tiered-data-for-analytics) Extract, transform, and load (ETL) using HDInsight: [https://docs.microsoft.com/en-us/azure/architecture/solution-ideas/articles/extract-transform-and-load-using-hdinsight](https://docs.microsoft.com/en-us/azure/architecture/solution-ideas/articles/extract-transform-and-load-using-hdinsight) IoT using Cosmos DB: [https://docs.microsoft.com/en-us/azure/architecture/solution-ideas/articles/iot-using-cosmos-db](https://docs.microsoft.com/en-us/azure/architecture/solution-ideas/articles/iot-using-cosmos-db) Streaming using HDInsight: [https://docs.microsoft.com/en-us/azure/architecture/solution-ideas/articles/streaming-using-hdinsight](https://docs.microsoft.com/en-us/azure/architecture/solution-ideas/articles/streaming-using-hdinsight) # GCP ## Connect - Cloud IoT Core - App Engine - Cloud Dataflow ## Buffer - Pub/Sub ## Processing - Compute Engine - Cloud Functions - Specialized tools: - Cloud Dataflow - Cloud Dataproc - Cloud Datalab - Cloud Dataprep - Cloud Composer - App Engine ## Store - Cloud Storage - Cloud SQL 
- Cloud Spanner - Cloud Datastore - Cloud BigTable - Cloud Storage - Cloud Memorystore - BigQuery ## Visualize ## Containerization - Kubernetes Engine - Container Security ## Best Practices Thanks to Ismail Holoubi for the following GCP links Best practices for migrating virtual machines to Compute Engine: https://cloud.google.com/solutions/best-practices-migrating-vm-to-compute-engine Best practices for Cloud Storage: https://cloud.google.com/storage/docs/best-practices Moving a publishing workflow to BigQuery for new data insights: https://cloud.google.com/blog/products/data-analytics/moving-a-publishing-workflow-to-bigquery-for-new-data-insights Architecture: Optimizing large-scale ingestion of analytics events and logs: https://cloud.google.com/solutions/architecture/optimized-large-scale-analytics-ingestion Choosing the right architecture for global data distribution: https://cloud.google.com/solutions/architecture/global-data-distribution Best Practices for Operating Containers: https://cloud.google.com/solutions/best-practices-for-operating-containers Automating IoT Machine Learning: Bridging Cloud and Device Benefits with AI Platform: https://cloud.google.com/solutions/automating-iot-machine-learning ================================================ FILE: sections/07-DataSources.md ================================================ 100 Plus Data Sources Data Science =================================== This is a section with links to data sources. During my data engineer coaching we need to find good data sets to work with. So, I started this section to make it easier to find good sources. I've taken these links from articles and blog posts. Why not only link the articles? You know, these posts can go away at any time. I want to keep the links to the platforms either way. I haven't had the chance to check each link myself. Please let me know if something isn't right. You can find the articles on the bottom of this section to read more. 
They include even more data sources I haven't had time to add to this list. ## Contents - [Student Favorites](07-DataSources.md#Student-Favorites) - [Content Marketing](07-DataSources.md#Content-Marketing) - [Crime](07-DataSources.md#Crime) - [Drugs](07-DataSources.md#Drugs) - [Education](07-DataSources.md#Education) - [Entertainment](07-DataSources.md#Entertainment) - [Environmental And Weather Data](07-DataSources.md#Environmental-And-Weather-Data) - [Financial And Economic Data](07-DataSources.md#Financial-And-Economic-Data) - [General And Academic](07-DataSources.md#General-And-Academic) - [Government And World](07-DataSources.md#Government-And-World) - [Health](07-DataSources.md#Health) - [Human Rights](07-DataSources.md#Human-Rights) - [Labor And Employment Data](07-DataSources.md#Labor-And-Employment-Data) - [Politics](07-DataSources.md#Politics) - [Retail](07-DataSources.md#Retail) - [Social](07-DataSources.md#Social) - [Source Articles and Blog Posts](07-DataSources.md#Source-Articles-and-Blog-Posts) - [Travel And Transportation](07-DataSources.md#Travel-And-Transportation) - [Various Portals](07-DataSources.md#Various-Portals) ## Student Favorites In my Coaching program my students learn by doing a project. And the foundation of every project is selecting a dataset. That can be an API or a file source, depending a lot on the student's goals and interests. Working out goals for the dataset, figuring out the data modeling, creating the architecture and building it. It's a fun way to learn and get better at Data Engineering. 
Here's a list of my students' favorite datasets and APIs. Learn more about the Coaching program: [click here](https://learndataengineering.com/p/data-engineering-coaching) ### Datasets - [Fraud detection](https://www.kaggle.com/datasets/kartik2112/fraud-detection) - [Industrial equipment monitoring](https://www.kaggle.com/datasets/dnkumars/industrial-equipment-monitoring-dataset) - [Energy demand & generation](https://www.kaggle.com/datasets/nicholasjhana/energy-consumption-generation-prices-and-weather?select=weather_features.csv) - [Online Retail](https://www.kaggle.com/datasets/tunguz/online-retail) - [Brazilian E-commerce](https://www.kaggle.com/datasets/olistbr/brazilian-ecommerce) - [Beijing Air Quality](https://www.kaggle.com/datasets/sid321axn/beijing-multisite-airquality-data-set) - [NYC Taxi](https://www.kaggle.com/datasets/diishasiing/revenue-for-cab-drivers) ### APIs - [Bike sharing Bluebikes](https://bluebikes.com/system-data) - [Bike sharing Divvy Bikes](https://divvybikes.com/system-data) - [Weather API](https://www.weatherapi.com/docs/) - [Bluesky API](https://docs.bsky.app/docs/advanced-guides/api-directory) - [Guardian news API](https://open-platform.theguardian.com/) - [Football API](https://www.api-football.com/) ## General And Academic - [Amazon Public Data Sets](https://registry.opendata.aws/) - [Datasets Subreddit](https://www.reddit.com/r/datasets) - [Enigma Public](https://public.enigma.com/) - [FiveThirtyEight](http://fivethirtyeight.com/) - [Google Scholar](http://scholar.google.com/) - [Pew Research](http://www.pewresearch.org/) - [The Upshot by New York Times](http://www.nytimes.com/section/upshot) - [UNData](http://data.un.org/) ## Content Marketing - [Buffer](https://blog.bufferapp.com/) - [Content Marketing Institute](http://contentmarketinginstitute.com/about/) - [HubSpot](http://www.hubspot.com/marketing-statistics) - [Moz](https://moz.com/blog) ## Crime - [Bureau of Justice Statistics](http://www.bjs.gov/index.cfm?ty=dca) - [FBI 
Crime Statistics](https://www.fbi.gov/stats-services/crimestats) - [National Archive of Criminal Justice Data](https://www.icpsr.umich.edu/icpsrweb/NACJD/) - [Uniform Crime Reporting Statistics](https://crime-data-explorer.fr.cloud.gov/) ## Drugs - [Drug Data and Database by First Databank](http://www.fdbhealth.com/) - [Drug War Facts](http://www.drugwarfacts.org/) - [National Institute on Drug Abuse](https://www.drugabuse.gov/related-topics/trends-statistics) - [U.S. Food and Drug Administration](http://www.fda.gov/Drugs/InformationOnDrugs/ucm079750.htm) - [United Nations Office on Drugs and Crime](https://www.unodc.org/unodc/en/data-and-analysis/) ## Education - [Education Data by the World Bank](http://data.worldbank.org/topic/education) - [Education Data by Unicef](http://data.unicef.org/education/overview.html) - [National Center for Education Statistics](https://nces.ed.gov/) ## Entertainment - [Academic Rights Press](http://www.academicrightspress.com/entertainment/music) - [BFI Film Forever](http://www.bfi.org.uk/education-research/film-industry-statistics-research) - [BLS: Arts, Entertainment, and Recreation](http://www.bls.gov/iag/tgs/iag71.htm) - [IFPI](http://www.ifpi.org/global-statistics.php) - [Million Song Dataset](https://aws.amazon.com/datasets/million-song-dataset/) - [Statista: Film Industry](http://www.statista.com/topics/964/film/) - [Statista: Music Industry](http://www.statista.com/topics/1639/music/) - [Statista: Video Game Industry](http://www.statista.com/topics/868/video-games/) - [The Numbers](http://www.the-numbers.com/) ## Environmental And Weather Data - [Environmental Protection Agency](https://www.epa.gov/data) - [International Energy Agency Atlas](https://www.iea.org/data-and-statistics?country=WORLD&fuel=Energy%20supply&indicator=TPESbySource) - [National Center for Environmental Health](http://www.cdc.gov/nceh/data.htm) - [National Climatic Data Center](http://www.ncdc.noaa.gov/data-access/quick-links#loc-clim) - [National 
Weather Service](http://www.weather.gov/help-past-weather) - [Weather Underground](https://www.wunderground.com/) - [WeatherBase](http://www.weatherbase.com/) ## Financial And Economic Data - [Federal Reserve Economic Database](https://fred.stlouisfed.org/) - [Financial Data Finder at OSU](./) - Missing link. - [Global Financial Data](https://www.globalfinancialdata.com/index.html) - [Google Finance](https://www.google.com/finance) - [Google Public Data Explorer](http://www.google.com/publicdata/directory) - [IMF Economic Data](https://data.imf.org/?sk=388dfa60-1d26-4ade-b505-a05a558d9a42) - [National Bureau of Economic Research](http://www.nber.org/data/) - [OpenCorporates](https://opencorporates.com/) - [The Atlas of Economic Complexity](http://atlas.cid.harvard.edu/) - [U.S. Bureau of Economic Analysis](http://www.bea.gov/) - [U.S. Securities and Exchange Commission](https://www.sec.gov/dera/data/financial-statement-data-sets.html) - [UN Comtrade Database](https://comtrade.un.org/labs/) - [Visualizing Economics](http://visualizingeconomics.com/) - [World Bank Doing Business Database](http://www.doingbusiness.org/rankings) - [World Bank Open Data](http://data.worldbank.org/) ## Government And World - [Data.gov](http://www.data.gov/) - [European Union Open Data Portal](http://data.europa.eu/euodp/en/data/) - [Gapminder](https://www.gapminder.org/data/) - [Land Matrix (Transnational Land Database)](http://landmatrix.org/en/) - [OECD Aid Database](http://www.oecd.org/dac/financing-sustainable-development/development-finance-data/) - [Open Data Network](http://www.opendatanetwork.com/) - [The CIA World Factbook](https://www.cia.gov/the-world-factbook/) - [The World Bank’s World Development Indicators](http://data.worldbank.org/data-catalog/world-development-indicators) - [U.S. 
Census Bureau](http://www.census.gov/) - [UNDP’s Human Development Index](http://hdr.undp.org/en/data) ## Health - [America’s Health Rankings](http://www.americashealthrankings.org/) - [Centers for Disease Control and Prevention](http://www.cdc.gov/datastatistics/) - [Health & Social Care Information Centre](http://www.hscic.gov.uk/home) - [Health Services Research Information Central](https://www.nlm.nih.gov/hsrinfo/datasites.html) - [HealthData.gov](https://www.healthdata.gov/) - [Medicare Hospital Quality](https://data.medicare.gov/data/hospital-compare#) - [MedicinePlus](https://www.nlm.nih.gov/medlineplus/healthstatistics.html) - [National Center for Health Statistics](http://www.cdc.gov/nchs/) - [SEER Cancer Incidence](http://seer.cancer.gov/faststats/selections.php?series=cancer) - [World Health Organization](http://www.who.int/en/) ## Human Rights - [Amnesty International](https://www.amnesty.org/en/search/?q=&documentType=Annual+Report) - [Human Rights Data Analysis Group](https://hrdag.org/) - [The Armed Conflict Database by Uppsala University](http://www.pcr.uu.se/research/UCDP/) ## Labor And Employment Data - [Bureau of Labor Statistics](http://www.bls.gov/) - [Department of Labor](https://www.dol.gov/general/topic/statistics/employment) - [Employment by U.S. Census](http://www.census.gov/topics/employment.html) - [U.S. 
Small Business Administration](https://www.sba.gov/starting-business/how-start-business/business-data-statistics/employment-statistics) ## Politics - [California Field Poll](http://dlab.berkeley.edu/data-resources/california-polls) - [Crowdpac](https://www.crowdpac.com/) - [Gallup](http://www.gallup.com/home.aspx) - [Open Secrets](https://www.opensecrets.org/) - [Rand State Statistics](http://www.randstatestats.org/us/) - [Real Clear Politics](http://guides.lib.berkeley.edu/Intro-to-Political-Science-Research/Stats) - [Roper Center for Public Opinion Research](https://ropercenter.cornell.edu/) - [US Voter Files](http://voterlist.electproject.org/) Note only some states are free, and most do not allow voter files to be used for commercial purposes - this map allows you to see the rules/cost for each state. ## Retail - [Love the Sales](https://www.lovethesales.com/press/data-request) ## Social - [Facebook Graph API](https://developers.facebook.com/docs/graph-api) - [Google Trends](http://www.google.com/trends/explore) - [SocialMention](./) - Missing link. ## Travel And Transportation - [Bureau of Transportation Statistics](https://www.bts.gov/browse-statistical-products-and-data) - [Monthly Tourism Statistics – U.S. Travelers Overseas](http://travel.trade.gov/research/monthly/departures/) - [Search the World](http://www.geoba.se/) - [SkiftStats](https://skift.com/skiftx/skiftstats/) - [U.S. 
Travel Association](https://www.ustravel.org/research) ## Various Portals - [Ckan](https://ckan.org/) - [Dataverse](https://dataverse.org/) - [DBpedia](https://wiki.dbpedia.org/) - [freeCodeCamp Open Data](https://github.com/freeCodeCamp/open-data) - [Kaggle](https://www.kaggle.com/datasets) - [LODUM](https://lodum.de/) - [Open Data Impact Map](http://opendataimpactmap.org/) - [Open Data Kit](https://opendatakit.org/) - [Open Data Monitor](https://opendatamonitor.eu/frontend/web/index.php?r=dashboard%2Findex) - [Plenar.io](http://plenar.io/) - [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/index.php) - [Yelp Open Datasets](https://www.yelp.com/dataset) ## Source Articles and Blog Posts - [100+ of the Best Free Data Sources For Your Next Project](https://www.columnfivemedia.com/100-best-free-data-sources-infographic) - [15 Great Free Data Sources for 2016](https://medium.com/@Infogram/15-great-free-data-sources-for-2016-25cb455db257) - [20 Awesome Sources of Free Data](https://www.searchenginejournal.com/free-data-sources/302601/#close) - [30+ Free Data Sources Every Company Should Be Aware Of](https://www.bernardmarr.com/default.asp?contentID=960) - [50 Amazing Free Data Sources You Should Know](https://infogram.com/blog/free-data-sources/) - [50 Best Open Data Sources Ready to be Used Right Now](https://learn.g2.com/open-data-sources) - [70 Amazing Free Data Sources You Should Know](https://www.kdnuggets.com/2017/12/big-data-free-sources.html) - [Big Data: 33 Brilliant And Free Data Sources Anyone Can Use](https://www.forbes.com/sites/bernardmarr/2016/02/12/big-data-35-brilliant-and-free-data-sources-for-2016/#527557ffb54d) - [These Are The Best Free Open Data Sources Anyone Can Use](https://www.freecodecamp.org/news/https-medium-freecodecamp-org-best-free-open-data-sources-anyone-can-use-a65b514b0f2d/) ================================================ FILE: sections/08-InterviewQuestions.md ================================================ 1001 
Data Engineering Interview Questions ========================================= Hey everyone, this collection of questions and answers is a work in progress. I'm going to keep adding Q&As, but you are invited to collaborate through [GitHub](https://github.com/andkret/Cookbook): - Either clone this repo, make your changes and create a pull request - or raise an issue on GitHub with your questions and answers and we'll add them Andreas ## Contents: - [Python](10-InterviewQuestions.md#python) - [SQL](10-InterviewQuestions.md#sql) - [Integrate](10-InterviewQuestions.md#integrate) - [APIs](10-InterviewQuestions.md#apis) - [Message Queues](10-InterviewQuestions.md#message-queues) - [Distributed Message Queues](10-InterviewQuestions.md#distributed-message-queues) - [Message Queues (Fifo)](10-InterviewQuestions.md#integrate) - [Caches](10-InterviewQuestions.md#caches) - [Data Processing](10-InterviewQuestions.md#data-processing) - [ETL](10-InterviewQuestions.md#etl) - [Stream Processing](10-InterviewQuestions.md#stream-processing) - [Batch Processing](10-InterviewQuestions.md#batch-processing) - [Processing Frameworks](10-InterviewQuestions.md#processing-frameworks) - [Serverless](10-InterviewQuestions.md#serverless) - [Distributed Processing Frameworks](10-InterviewQuestions.md#distributed-processing-frameworks) - [Scheduling](10-InterviewQuestions.md#scheduling) - [Airflow](10-InterviewQuestions.md#airflow) - [CI-CD](10-InterviewQuestions.md#ci-cd) - [Docker](10-InterviewQuestions.md#docker) - [Kubernetes](10-InterviewQuestions.md#kubernetes) - [Data Storage](10-InterviewQuestions.md#data-storage) - [Relational Databases](10-InterviewQuestions.md#relational-databases) - [NoSQL](10-InterviewQuestions.md#nosql) - [Analytical Stores](10-InterviewQuestions.md#analytical-stores) - [Relational Modeling](10-InterviewQuestions.md#relational-modeling) - [Dimensional Data Modeling](10-InterviewQuestions.md#dimensional-modeling) - [Data Lakes](10-InterviewQuestions.md#data-lakes) - 
[Data Platforms](10-InterviewQuestions.md#data-platforms) - [AWS](10-InterviewQuestions.md#aws) - [Azure](10-InterviewQuestions.md#azure) - [GCP](10-InterviewQuestions.md#gcp) - [Snowflake](10-InterviewQuestions.md#snowflake) ### Python 1. **What is Apache Spark, and how can you use it with Python?** - **Answer**: Apache Spark is a distributed data processing framework that allows for big data processing with in-memory computing capabilities. You can use it with Python through PySpark, which provides a Python API for Spark. PySpark enables data engineers to write Spark applications in Python. 2. **How do you perform data cleaning in Python?** - **Answer**: Data cleaning in Python can be performed using the `pandas` library. Common tasks include handling missing values (`dropna`, `fillna`), removing duplicates (`drop_duplicates`), converting data types, normalizing data, and handling outliers. Example: ```python import pandas as pd df = pd.read_csv('data.csv') df.dropna(inplace=True) # Remove rows with missing values df['column'] = df['column'].astype(int) # Convert column to integer type ``` 3. **Explain how you would optimize a slow-running SQL query within a Python ETL pipeline.** - **Answer**: To optimize a slow-running SQL query, you can: - Analyze the query execution plan. - Add appropriate indexes. - Optimize the query by reducing complexity, such as using JOINs efficiently and avoiding unnecessary subqueries. - Partition large tables if applicable. - Use caching and materialized views for frequently accessed data. - Ensure that statistics are up to date. Example with SQLAlchemy: ```python from sqlalchemy import create_engine engine = create_engine('postgresql://user:password@localhost/dbname') with engine.connect() as connection: result = connection.execute('SELECT * FROM table WHERE condition') data = result.fetchall() ``` 4. 
**What is the role of a workflow scheduler in data engineering, and can you name some common ones?** - **Answer**: A workflow scheduler automates and manages the execution of ETL jobs and data pipelines. It ensures tasks are executed in the correct order and handles retries, dependencies, and monitoring. Common workflow schedulers include Apache Airflow, Luigi, Prefect, and Apache NiFi. 5. **How do you handle schema changes in a data pipeline?** - **Answer**: Handling schema changes in a data pipeline involves: - Implementing schema evolution techniques. - Using tools like Apache Avro, which supports schema evolution. - Versioning schemas and ensuring backward compatibility. - Monitoring and validating incoming data against the schema. - Applying transformations to adapt to new schemas. Example with Avro: ```python from avro.datafile import DataFileReader from avro.io import DatumReader reader = DataFileReader(open("data.avro", "rb"), DatumReader()) for record in reader: print(record) reader.close() ``` 6. **What is data partitioning, and why is it important in data engineering?** - **Answer**: Data partitioning is the process of dividing a large dataset into smaller, more manageable pieces, often based on a key such as date, user ID, or geographic location. Partitioning improves query performance by reducing the amount of data scanned and allows for parallel processing. It also helps in managing large datasets and reducing I/O costs. 7. **How do you ensure data quality in your pipelines?** - **Answer**: Ensuring data quality involves: - Implementing data validation checks (e.g., constraints, data type checks). - Monitoring for data anomalies and inconsistencies. - Using data profiling tools to understand the data. - Creating unit tests for data processing logic. - Automating data quality checks and alerting mechanisms. 
Example with `pandas` for data validation: ```python import pandas as pd df = pd.read_csv('data.csv') assert df['column'].notnull().all(), "Missing values found in column" assert (df['age'] >= 0).all(), "Negative ages found" ``` 8. **What is the difference between batch processing and stream processing?** - **Answer**: Batch processing involves processing large volumes of data at once, usually at scheduled intervals. It is suitable for tasks that are not time-sensitive. Stream processing, on the other hand, involves processing data in real-time as it arrives, which is suitable for time-sensitive applications such as real-time analytics, monitoring, and alerts. 9. **How do you implement logging and monitoring in your data pipelines?** - **Answer**: Logging and monitoring can be implemented using: - Logging libraries like Python's `logging` module to capture and store logs. - Monitoring tools like Prometheus, Grafana, or ELK Stack (Elasticsearch, Logstash, Kibana) to visualize and monitor logs. - Setting up alerts for failures or anomalies. Example with Python's `logging` module: ```python import logging logging.basicConfig(filename='pipeline.log', level=logging.INFO) logging.info('This is an informational message') logging.error('This is an error message') ``` 10. **What are some common challenges you face with distributed data processing, and how do you address them?** - **Answer**: Common challenges with distributed data processing include data consistency, fault tolerance, data shuffling, and latency. To address these: - Use distributed processing frameworks like Apache Spark, which handle many of these issues internally. - Implement robust error handling and retries. - Optimize data shuffling by partitioning data effectively. - Use caching mechanisms to reduce latency. - Ensure proper resource allocation and scaling to handle large data volumes. 
## SQL ## Integrate ### APIs These questions cover a range of topics related to APIs, including their concepts, security, best practices, and specific implementation details. 1. **What is an API and how does it work?** - **Answer**: An API (Application Programming Interface) is a set of rules and protocols for building and interacting with software applications. It allows different software systems to communicate with each other. APIs define the methods and data formats that applications can use to request and exchange data. 2. **What are the different types of APIs?** - **Answer**: The main types of APIs include: - **Open APIs (Public APIs)**: Available to developers and other users with minimal restrictions. - **Internal APIs (Private APIs)**: Used within an organization to connect systems and data internally. - **Partner APIs**: Shared with specific business partners and offer more control over how data is exposed. - **Composite APIs**: Combine multiple API requests into a single call, allowing multiple data or service requests in one API call. 3. **What is REST and how does it differ from SOAP?** - **Answer**: REST (Representational State Transfer) and SOAP (Simple Object Access Protocol) are two different approaches to building APIs. REST uses standard HTTP methods (GET, POST, PUT, DELETE) and is stateless, meaning each request from a client to a server must contain all the information needed to understand and process the request. SOAP, on the other hand, is a protocol that relies on XML-based messaging and includes built-in rules for security and transactions. 4. **Explain the concept of RESTful services.** - **Answer**: RESTful services are web services that follow the principles of REST. These principles include: - **Statelessness**: Each request from a client must contain all the information needed by the server to process the request. 
- **Client-Server Architecture**: The client and server are separate entities, and they communicate over a network via standard HTTP. - **Cacheability**: Responses from the server can be cached by the client or intermediate proxies to improve performance. - **Uniform Interface**: Resources are identified in the request (usually via URIs), and actions are performed using standard HTTP methods. 5. **What is an API gateway and why is it used?** - **Answer**: An API gateway is a server that acts as an intermediary for requests from clients seeking resources from backend services. It provides various functions such as request routing, composition, protocol translation, and handling of cross-cutting concerns like authentication, authorization, logging, monitoring, and rate limiting. It simplifies the client interface and improves security, scalability, and manageability of API services. 6. **How do you ensure the security of an API?** - **Answer**: Ensuring API security involves several practices, including: - **Authentication**: Verify the identity of the user or system making the request (e.g., using OAuth, JWT). - **Authorization**: Ensure the authenticated user or system has permission to perform the requested action. - **Encryption**: Use HTTPS to encrypt data in transit between the client and server. - **Rate Limiting**: Prevent abuse by limiting the number of requests a client can make in a given time period. - **Input Validation**: Validate and sanitize all inputs to prevent injection attacks. - **Logging and Monitoring**: Track API usage and monitor for unusual or suspicious activity. 7. **What is versioning in APIs and how is it typically managed?** - **Answer**: API versioning is the practice of managing changes to an API without disrupting existing clients. It can be managed in several ways, including: - **URI Versioning**: Including the version number in the URI path (e.g., `/v1/resource`). 
- **Query Parameter Versioning**: Including the version number as a query parameter (e.g., `/resource?version=1`). - **Header Versioning**: Including the version number in the HTTP headers (e.g., `Accept: application/vnd.example.v1+json`). 8. **What are HTTP status codes and why are they important in API responses?** - **Answer**: HTTP status codes are standardized codes returned by a server to indicate the result of a client's request. They are important because they provide meaningful feedback to the client about what happened with their request. Common status codes include: - **200 OK**: The request was successful. - **201 Created**: A resource was successfully created. - **400 Bad Request**: The request was invalid or cannot be processed. - **401 Unauthorized**: Authentication is required and has failed or has not yet been provided. - **404 Not Found**: The requested resource could not be found. - **500 Internal Server Error**: An error occurred on the server. 9. **Explain the concept of idempotency in RESTful APIs.** - **Answer**: Idempotency refers to the property of certain operations whereby performing the same operation multiple times results in the same outcome. In RESTful APIs, methods like GET, PUT, and DELETE are idempotent because making the same request multiple times has the same effect as making it once. POST is not idempotent because multiple requests could create multiple resources. 10. **How do you handle pagination in APIs?** - **Answer**: Pagination is used to split large sets of data into manageable chunks. Common methods for handling pagination include: - **Offset and Limit**: Using query parameters to specify the starting point and number of records to return (e.g., `?offset=0&limit=10`). - **Page Number and Size**: Using query parameters to specify the page number and the number of records per page (e.g., `?page=1&size=10`). 
- **Cursor-Based Pagination**: Using a cursor (a pointer to a specific record) to fetch the next set of results (e.g., `?cursor=abc123`). These additional questions cover more advanced topics related to APIs, including security, design principles, best practices, and tooling. 11. **What is the difference between synchronous and asynchronous API calls?** - **Answer**: Synchronous API calls wait for the response before continuing, blocking the execution of code until the operation completes. Asynchronous API calls, on the other hand, do not block the execution; they allow the code to continue running and handle the response once it arrives, typically through callbacks, promises, or async/await patterns. 12. **What is a webhook, and how does it differ from an API endpoint?** - **Answer**: A webhook is a way for an application to provide other applications with real-time information. A webhook is a "callback" that allows the sending application to push data to the receiving application when an event occurs. Unlike traditional API endpoints, which require the client to periodically check for data (polling), webhooks enable the server to push data to the client when an event occurs. 13. **What is CORS, and why is it important in the context of APIs?** - **Answer**: CORS (Cross-Origin Resource Sharing) is a security feature implemented in web browsers that restricts web pages from making requests to a different domain than the one that served the web page. It is important in APIs to control how resources on a server are accessed by external domains. Proper CORS configuration ensures that only authorized domains can access API resources. 14. **What is the purpose of API documentation, and what should it include?** - **Answer**: API documentation provides developers with the information they need to use and integrate with an API effectively. It should include: - An overview of the API and its purpose. - Authentication and authorization methods. 
- Endpoint definitions and available methods (GET, POST, PUT, DELETE). - Request and response formats (including headers, query parameters, and body data). - Error codes and their meanings. - Examples of requests and responses. - Rate limits and usage policies. 15. **What are API gateways, and what role do they play in API management?** - **Answer**: API gateways act as intermediaries between clients and backend services. They provide various functions such as request routing, load balancing, security (authentication and authorization), rate limiting, logging, monitoring, and transforming requests and responses. API gateways simplify client interactions with microservices and help manage and secure APIs. 16. **How do you handle authentication and authorization in APIs?** - **Answer**: Authentication verifies the identity of a user or application, while authorization determines what resources and operations they have access to. Common methods for handling authentication and authorization in APIs include: - API keys: Simple tokens provided to access the API. - OAuth: An open standard for token-based authentication and authorization. - JWT (JSON Web Tokens): A compact, URL-safe means of representing claims to be transferred between two parties. - Basic Auth: A simple method using a username and password encoded in base64. 17. **What is the concept of rate limiting in APIs, and why is it important?** - **Answer**: Rate limiting controls the number of requests a client can make to an API within a specified time period. It is important for: - Preventing abuse and overuse of API resources. - Ensuring fair usage among clients. - Protecting the backend services from being overwhelmed. - Managing and maintaining service quality and performance. 18. **Explain the concept of API throttling.** - **Answer**: API throttling is the process of controlling the usage rate of an API by limiting the number of requests a client can make within a certain timeframe. 
Throttling helps prevent abuse, protects resources, and ensures that the service remains available and responsive to all users. It can be implemented using techniques such as rate limits, quotas, and burst control. 19. **What is HATEOAS and how does it relate to RESTful APIs?** - **Answer**: HATEOAS (Hypermedia As The Engine Of Application State) is a constraint of RESTful APIs where hypermedia links are included in the responses to guide clients through the API. It allows clients to dynamically discover available actions and navigate the API without hardcoding the structure. For example, a response to a GET request for a user resource might include links to update or delete the user. 20. **What are some common tools and platforms for testing and documenting APIs?** - **Answer**: Common tools and platforms for testing and documenting APIs include: - **Postman**: A popular tool for developing, testing, and documenting APIs. - **Swagger/OpenAPI**: A framework for designing, building, and documenting RESTful APIs, often used with tools like Swagger UI and Swagger Editor. - **Insomnia**: An API client for testing RESTful and GraphQL APIs. - **Apigee**: An API management platform providing tools for API design, security, analytics, and monitoring. - **Paw**: A macOS-based API client for testing and documenting APIs. - **RAML (RESTful API Modeling Language)**: A language for designing and documenting APIs. ## Message queues ### Distributed Message Queues ### Message Queues (Fifo) ### Caches ## Data Processing ### ETL ### Stream processing ### Batch processing ### Processing Frameworks #### Serverless #### Distributed Processing frameworks ### Scheduling #### Airflow ### Docker and Kubernetes ### CI-CD ## Data Storage ### Relational Databases ### NoSQL ### Analytical Stores ### Relational Modeling ### Dimensional Data Modeling ### Data Lakes ## Data Platforms ### AWS ### GCP ### Azure ### Snowflake Looking for a job or just want to know what people find important? 
In this chapter you can find a lot of interview questions we collect on the stream. Ultimately this should reach at least one thousand and one questions. **But Andreas, where are the answers??** Answers are for losers. I have been thinking a lot about this and the best way for you to prepare and learn is to look into these questions yourself. This cookbook or Google will take you a long way. Some questions we discuss directly on the live stream. Live Streams ------------ First live stream where we started to collect these questions. | Podcast Episode: #096 1001 Data Engineering Interview Questions |------------------| |First live stream where we collect and try to answer as many interview questions as possible. If this helps people and is fun, we will do this regularly until we reach one thousand and one. | [Watch on YouTube](https://youtu.be/WbqRH2r3N40) All Interview Questions ----------------------- The interview questions are roughly structured like the sections in the \"Basic data engineering skills\" part. This makes it easier to navigate this document. I still need to sort them accordingly. ### SQL DBs - What are windowing functions? - What is a stored procedure? - Why would you use them? - What are atomic attributes? - Explain the ACID properties of a database - How to optimize queries? - What are the different types of JOIN (CROSS, INNER, OUTER)? - What is the difference between a Clustered Index and a Non-Clustered Index - with examples? ### The Cloud - What is serverless? - What is the difference between IaaS, PaaS and SaaS? - How do you move from the ingest layer to the consumption layer? (In Serverless) - What is edge computing? - What is the difference between cloud and edge and on-premise? ### Linux - What is crontab? ### Big Data - What are the 4 V's? - Which one is most important? ### Kafka - What is a topic? - How to ensure FIFO? - How do you know if all messages in a topic have been fully consumed? - What are brokers? - What are consumer groups? - What is a producer?
### Coding - What is the difference between an object and a class? - Explain immutability - What are AWS Lambda functions and why would you use them? - Difference between library, framework and package - How to reverse a linked list - Difference between args and kwargs - Difference between OOP and functional programming ### NoSQL DBs - What is a key-value (row-store) store? - What is a columnstore? - Difference between a row store and a column store - What is a document store? - Difference between Redshift and Snowflake ### Hadoop - What file formats can you use in Hadoop? - What is the difference between a namenode and a datanode? - What is HDFS? - What is the purpose of YARN? ### Lambda Architecture - What is streaming and batching? - What is the upside of streaming vs batching? - What is the difference between lambda and kappa architecture? - Can you sync the batch and streaming layer and if yes how? ### Data Warehouse & Data Lake - What is a data lake? - What is a data warehouse? - Are there data lake warehouses? - Two data lakes within a single warehouse? - What is a data mart? - What is a slowly changing dimension (types)? - What is a surrogate key and why use them? ### APIs (REST) - What does REST mean? - What is idempotency? - What are common REST API frameworks (Jersey and Spring)? ### Apache Spark - What is an RDD? - What is a dataframe? - What is a dataset? - How is a dataset typesafe? - What is Parquet? - What is Avro? - Difference between Parquet and Avro - Tumbling Windows vs. Sliding Windows - Difference between batch and stream processing - What are microbatches? ### MapReduce - What is a use case of MapReduce? - Write pseudocode for word count - What is a combiner? ### Docker & Kubernetes - What is a container? - Difference between a Docker Container and a Virtual PC - What is the easiest way to learn Kubernetes fast? ### Data Pipelines - What is an example of a serverless pipeline? - What is the difference between at-most-once, at-least-once and exactly-once delivery?
- What systems provide transactions? - What is an ETL pipeline? ### Airflow - What is a DAG (in context of Airflow/Luigi)? - What are hooks/is a hook? - What are operators? - How to branch? ### DataVisualization - What is a BI tool? ### Security/Privacy - What is Kerberos? - What is a firewall? - What is GDPR? - What is anonymization? ### Distributed Systems - How do clusters reach consensus (the answer was using consensus protocols like Paxos or Raft)? Good I didn't have to explain Paxos - What is the CAP theorem / explain it (What factors should be considered when choosing a DB?) - How to choose the right storage for different data consumers? It's always a tricky question ### Apache Flink - What is Flink used for? - Flink vs Spark? ### GitHub - What are branches? - What are commits? - What's a pull request? ### Dev/Ops - What is continuous integration? - What is continuous deployment? - Difference between CI and CD ### Development / Agile - What is Scrum? - What is OKR? - What is Jira and what is it used for?
================================================ FILE: sections/09-BooksAndCourses.md ================================================ Recommended Books, Courses, and Podcasts ============================= ## Contents - [About Books and Courses](09-BooksAndCourses.md#about-books-and-courses) - [Books](09-BooksAndCourses.md#books) - [Languages](09-BooksAndCourses.md#books-languages) - [Data Science Tools](09-BooksAndCourses.md#books-data-science-tools) - [Business](09-BooksAndCourses.md#Books-Business) - [Community Recommendations](09-BooksAndCourses.md#Community-Recommendations) - [Online Courses](09-BooksAndCourses.md#Online-Courses) - [Preparation courses](09-BooksAndCourses.md#Preparation-courses) - [Data engineering courses](09-BooksAndCourses.md#Data-engineering-courses) - [Certifications](09-BooksAndCourses.md#Certifications) - [Podcasts](09-BooksAndCourses.md#Podcasts) ## About Books, Courses, and Podcasts This is a collection of books and courses I can recommend personally. They are great for every data engineering learner. I either have used or own these books during my professional work. I also looked into every online course personally. If you want to buy a book or course and support my work, please use one of my links below. They are all affiliate marketing links that help me fund this passion. Of course all this comes at no additional expense to you, but it helps me a lot. 
You can find even more interesting books and all of my podcast equipment on my Amazon store: [Go to the Amazon store](https://www.amazon.com/shop/plumbersofdatascience) PS: Don't just get a book and expect to learn everything - Course certificates alone won't help you - Have a purpose in mind, like a small project - Great for use at work ## Books ### Languages #### Java [Learning Java: A Bestselling Hands-On Java Tutorial](https://amzn.to/2MgYp8h) #### Python [Learning Python, 5th Edition](https://amzn.to/2MdpM34) #### Scala [Programming Scala: Scalability = Functional Programming + Objects](https://amzn.to/2VIpww5) #### Swift [Learning Swift: Building Apps for macOS, iOS, and Beyond](https://amzn.to/31hDN4e) ### Data Science Tools #### Apache Spark [Learning Spark: Lightning-Fast Big Data Analysis](https://amzn.to/31mtAUg) #### Apache Kafka [Kafka Streams in Action: Real-time apps and microservices with the Kafka Streams API](https://amzn.to/35uiSOJ) #### Apache Hadoop [Hadoop: The Definitive Guide: Storage and Analysis at Internet Scale](https://amzn.to/2VNzf4n) #### Apache HBase [HBase: The Definitive Guide: Random Access to Your Planet-Size Data](https://amzn.to/2BbiyGz) ### Business #### The Lean Startup [The Lean Startup: How Today's Entrepreneurs Use Continuous Innovation to Create Radically Successful Businesses](https://amzn.to/2Meyv5e) #### Zero to One [Zero to One: Notes on Startups, or How to Build the Future](https://amzn.to/2BbBwgr) #### The Innovator's Dilemma [The Innovator's Dilemma: When New Technologies Cause Great Firms to Fail (Management of Innovation and Change)](https://amzn.to/31eGZ0k) #### Crossing the Chasm [Crossing the Chasm, 3rd Edition (Collins Business Essentials)](https://amzn.to/2IU7QZs) #### Crush It!
[Crush It!: Why Now Is The Time To Cash In On Your Passion](https://amzn.to/33xe7Su) ### Community Recommendations #### Designing Data-Intensive Applications "In my opinion, the knowledge contained in this book differentiates a data engineer from a software engineer or a developer. The book strikes a good balance between breadth and depth of discussion on data engineering topics, as well as the tradeoffs we must make due to working with massive amounts of data." -- David Lee on LinkedIn [Designing Data-Intensive Applications: The Big Ideas Behind Reliable, Scalable, and Maintainable Systems](https://amzn.to/2MIqTqJ) ## Online Courses ### Preparation courses | Course name | Course description | Course URL | |---|---|---| | The Bits and Bytes of Computer Networking | This course is designed to provide a full overview of computer networking. We’ll cover everything from the fundamentals of modern networking technologies and protocols to an overview of the cloud to practical applications and network troubleshooting. | https://www.coursera.org/learn/computer-networking | | Learn SQL \| Codecademy | In this SQL course, you'll learn how to manage large datasets and analyze real data using the standard data management language. | https://www.codecademy.com/learn/learn-sql | | Learn Python 3 \| Codecademy | Learn the basics of Python 3, one of the most powerful, versatile, and in-demand programming languages today. | https://www.codecademy.com/learn/learn-python-3 | ### Data engineering courses | Course name | Course description | Course URL | |---|---|---| | **1. Data Engineering Basics** | | | | Introduction to Data Engineering | Introduction to Data Engineering with over 1 hour of videos including my journey here. | https://learndataengineering.com/p/introduction-to-data-engineering | | Computer Science Fundamentals | A complete guide of topics and resources you should know as a Data Engineer. 
| https://learndataengineering.com/p/data-engineering-fundamentals | | Introduction to Python | Learn all the fundamentals of Python to start coding quickly | https://learndataengineering.com/p/introduction-to-python | | Python for Data Engineers | Learn all the Python topics a Data Engineer needs even if you don't have a coding background | https://learndataengineering.com/p/python-for-data-engineers | | Docker Fundamentals | Learn all the fundamental Docker concepts with hands-on examples | https://learndataengineering.com/p/docker-fundamentals | | Successful Job Application | Everything you need to get your dream job in Data Engineering. | https://learndataengineering.com/p/successful-job-application | | Data Preparation & Cleaning for ML | All you need for preparing data to enable Machine Learning. | https://learndataengineering.com/p/data-preparation-and-cleaning-for-ml | | **2. Platform & Pipeline Design Fundamentals** | | | | Data Platform And Pipeline Design | Learn how to build data pipelines with templates and examples for Azure, GCP and Hadoop. | https://learndataengineering.com/p/data-pipeline-design | | Platform & Pipelines Security | Learn the important security fundamentals for Data Engineering | https://learndataengineering.com/p/platform-pipeline-security | | Choosing Data Stores | Learn the different types of data stores and when to use which. | https://learndataengineering.com/p/choosing-data-stores | | Schema Design Data Stores | Learn how to design schemas for SQL, NoSQL and Data Warehouses. | https://learndataengineering.com/p/data-modeling | | **3.
Fundamental Tools** | | | | Building APIs with FastAPI | Learn the fundamentals of designing, creating and deploying APIs with FastAPI and Docker | https://learndataengineering.com/p/apis-with-fastapi-course | | Apache Kafka Fundamentals | Learn the fundamentals of Apache Kafka | https://learndataengineering.com/p/apache-kafka-fundamentals | | Apache Spark Fundamentals | Apache Spark quick start course in Python with Jupyter notebooks, DataFrames, SparkSQL and RDDs. | https://learndataengineering.com/p/learning-apache-spark-fundamentals | | Data Engineering on Databricks | Everything you need to get started with Databricks. From setup to building ETL pipelines & warehousing. | https://learndataengineering.com/p/data-engineering-on-databricks | | MongoDB Fundamentals | Learn how to use MongoDB | https://learndataengineering.com/p/mongodb-fundamentals-course | | Log Analysis with Elasticsearch | Learn how to monitor and debug your data pipelines | https://learndataengineering.com/p/log-analysis-with-elasticsearch | | Airflow Workflow Orchestration | Learn how to orchestrate your data pipelines with Apache Airflow | https://learndataengineering.com/p/learn-apache-airflow | | Snowflake for Data Engineers | Everything you need to get started with Snowflake | https://learndataengineering.com/p/snowflake-for-data-engineers | | dbt for Data Engineers | Everything you need to work with dbt and Snowflake | https://learndataengineering.com/p/dbt-for-data-engineers | | **4. Full Hands-On Example Projects** | | | | Data Engineering on AWS | Full 5 hours course with complete example project. Building stream and batch processing pipelines on AWS. | https://learndataengineering.com/p/data-engineering-on-aws | | Data Engineering on Azure | Ingest, Store, Process, Serve and Visualize Streams of Data by Building Streaming Data Pipelines in Azure. 
| https://learndataengineering.com/p/build-streaming-data-pipelines-in-azure | | Data Engineering on GCP | Everything you need to start with Google Cloud. | https://learndataengineering.com/p/data-engineering-on-gcp | | Modern Data Warehouses & Data Lakes | How to integrate a Data Lake with a Data Warehouse and query data directly from files | https://learndataengineering.com/p/modern-data-warehouses | | Machine Learning & Containerization On AWS | Build an app that analyzes the sentiment of tweets and visualizes them on a user interface hosted as a container | https://learndataengineering.com/p/ml-on-aws | | Contact Tracing with Elasticsearch | Track 100,000 users in San Francisco using Elasticsearch and an interactive Streamlit user interface | https://learndataengineering.com/p/contact-tracing-with-elasticsearch | | Document Streaming Project | Document Streaming with FastAPI, Kafka, Spark Streaming, MongoDB and Streamlit | https://learndataengineering.com/p/document-streaming | | Storing & Visualizing Time Series Data with InfluxDB and Grafana | Learn how to use InfluxDB to store time series data and visualize interactive dashboards with Grafana | https://learndataengineering.com/p/time-series-influxdb-grafana | | Data Engineering with Hadoop | Hadoop Project with HDFS, YARN, MapReduce, Hive and Sqoop! | https://learndataengineering.com/p/data-engineering-with-hadoop | | Dockerized ETL | Learn how to quickly set up a simple ETL script with AWS TDengine & Grafana | https://learndataengineering.com/p/timeseries-etl-with-aws-tdengine-grafana | ## Certifications Here's a list of great certifications that you can do on AWS and Azure. We left out GCP here, because the adoption of AWS and Azure is a lot higher and that's why I recommend starting with one of these. The listed costs are usually the fees for taking the certification exams. We also added the level and prerequisites to make it easier for you to decide which one fits you.
| Platform | Certification Name | Price | Level | Prerequisite Experience | URL | |----------|---------------------------------------------------------|-------|-------------|------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------| | AWS | AWS Certified Cloud Practitioner (maybe) | 100 | Beginner | Familiarity with the AWS platform is recommended but not required. | [Link](https://aws.amazon.com/certification/certified-cloud-practitioner/) | | AWS | AWS Certified Solutions Architect - Professional | 300 | Expert | AWS Certified Solutions Architect - Professional is intended for individuals with two or more years of hands-on experience designing and deploying cloud architecture on AWS. | [Link](https://aws.amazon.com/certification/certified-solutions-architect-professional/?ch=sec&sec=rmg&d=1) | | AWS | AWS Certified Solutions Architect - Associate | 150 | Intermediate| This is an ideal starting point for candidates with AWS Cloud or strong on-premises IT experience. This exam does not require deep hands-on coding experience, although familiarity with basic programming concepts would be an advantage. | [Link](https://aws.amazon.com/certification/certified-solutions-architect-associate/) | | AWS | AWS Certified Data Engineer - Associate | 150 | Intermediate| The ideal candidate for this exam has the equivalent of 2-3 years of experience in data engineering or data architecture and a minimum of 1-2 years of hands-on experience with AWS services.
| [Link](https://aws.amazon.com/certification/certified-data-engineer-associate/) | | Azure | Microsoft Certified: Azure Cosmos DB Developer Specialty| 165 | Intermediate| | [Link](https://learn.microsoft.com/en-us/credentials/certifications/azure-cosmos-db-developer-specialty/?practice-assessment-type=certification) | | Azure | Microsoft Certified: Azure Data Engineer Associate - DP 203| 165 | Intermediate| | [Link](https://learn.microsoft.com/en-us/credentials/certifications/azure-data-engineer/?practice-assessment-type=certification) | | Azure | Microsoft Certified: Azure Data Fundamentals | 99 | Beginner | | [Link](https://learn.microsoft.com/en-us/credentials/certifications/azure-data-fundamentals/?practice-assessment-type=certification) | | Azure | Microsoft Certified: Azure Database Administrator Associate| 165 | Intermediate| | [Link](https://learn.microsoft.com/en-us/credentials/certifications/azure-database-administrator-associate/?practice-assessment-type=certification) | | Azure | Microsoft Certified: Azure Developer Associate | 165 | Intermediate| | [Link](https://learn.microsoft.com/en-us/credentials/certifications/azure-developer/?practice-assessment-type=certification) | | Azure | Microsoft Certified: Azure Fundamentals | 99 | Beginner | | [Link](https://learn.microsoft.com/en-us/credentials/certifications/azure-fundamentals/?practice-assessment-type=certification) | | Azure | Microsoft Certified: Azure Solutions Architect Expert | 165 | Expert | Microsoft Certified: Azure Administrator Associate certification | [Link](https://learn.microsoft.com/en-us/credentials/certifications/azure-solutions-architect/) | | Azure | Microsoft Certified: Fabric Analytics Engineer Associate| 165 | Intermediate| | [Link](https://learn.microsoft.com/en-us/credentials/certifications/fabric-analytics-engineer-associate/?practice-assessment-type=certification) | | Azure | Microsoft Certified: Fabric Data Engineer Associate | 165 | Intermediate| | 
[Link](https://learn.microsoft.com/en-us/credentials/certifications/fabric-data-engineer-associate/) | | Azure | Microsoft Certified: Power BI Data Analyst Associate | 165 | Intermediate| | [Link](https://learn.microsoft.com/en-us/credentials/certifications/data-analyst-associate/?practice-assessment-type=certification) | ## Podcasts Top five podcasts by the number of episodes created. ### Super Data Science [The latest machine learning, A.I., and data career topics from across both academia and industry are brought to you by host Dr. Jon Krohn on the Super Data Science Podcast.](https://podcasts.apple.com/us/podcast/super-data-science/id1163599059) ### Data Skeptic [The Data Skeptic Podcast features interviews and discussion of topics related to data science, statistics, machine learning, artificial intelligence and the like, all from the perspective of applying critical thinking and the scientific method to evaluate the veracity of claims and efficacy of approaches.](https://podcasts.apple.com/us/podcast/data-skeptic/id890348705) ### Data Engineering Podcast [This show goes behind the scenes for the tools, techniques, and difficulties associated with the discipline of data engineering. Databases, workflows, automation, and data manipulation are just some of the topics that you will find here.](https://podcasts.apple.com/us/podcast/data-engineering-podcast/id1193040557?mt=2) ### Roaring Elephant BiteSized Big Tech [A weekly community podcast about Big Technology with a focus on Open Source, Advanced Analytics and other modern magic.](https://roaringelephant.org/) ### SQL Data Partners Podcast [Hosted by Carlos L Chacon, the SQL Data Partners Podcast focuses on Microsoft data platform related topics mixed with a sprinkling of professional development. 
Carlos and guests discuss new and familiar features and ideas and how you might apply them in your environments.](https://podcasts.apple.com/us/podcast/sql-data-partners-podcast/id1027394388) ### Complete list | Host name | Podcast name | Access podcast | |-------------------------|----------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------| | Jon Krohn | Super Data Science | https://www.superdatascience.com/podcast | | Kyle Polich | Data Skeptic | https://dataskeptic.com/ | | Tobias Macey | Data Engineering Podcast | https://www.dataengineeringpodcast.com/ | | Dave Russell | Roaring Elephant - Bite-Sized Big Tech | https://roaringelephant.org/ | | Carlos L Chacon | SQL Data Partners Podcast | https://sqldatapartners.com/podcast/ | | Jason Himmelstein | BIFocal - Clarifying Business Intelligence | https://bifocal.show/ | | Scott Hirleman | Data Mesh Radio | https://daappod.com/data-mesh-radio/ | | Jonathan Schwabish | PolicyViz | https://policyviz.com/podcast/ | | Al Martin | Making Data Simple | https://www.ibm.com/blogs/journey-to-ai/2021/02/making-data-simple-this-week-we-continue-our-discussion-on-data-framework-and-what-is-meant-by-data-framework/ | | John David Ariansen | How to Get an Analytics Job | https://www.silvertoneanalytics.com/how-to-get-an-analytics-job/ | | Moritz Stefaner | Data Stories | https://datastori.es/ | | Hilary Parker | Not So Standard Deviations | https://nssdeviations.com/ | | Ben Lorica | The Data Exchange with Ben Lorica | https://thedataexchange.media/author/bglorica/ | | Juan Sequeda | Catalog & Cocktails | https://data.world/resources/podcasts/ | | Wayne Eckerson | Secrets of Data Analytics Leaders | https://www.eckerson.com/podcasts/secrets-of-data-analytics-leaders | | Guy Glantser | SQL Server Radio | https://www.sqlserverradio.com/ | | Eitan 
Blumin | SQL Server Radio | https://www.sqlserverradio.com/ | | Jason Tan | The Analytics Show | https://ddalabs.ai/the-analytics-show/ | | Hugo Bowne-Anderson | DataFramed | https://www.datacamp.com/podcast | | Kostas Pardalis | The Data Stack Show | https://datastackshow.com/ | | Eric Dodds | The Data Stack Show | https://datastackshow.com/ | | Catherine King | The Business of Data Podcast | https://podcasts.apple.com/gb/podcast/the-business-of-data-podcast/id1528796448 | | | The Business of Data | https://business-of-data.com/podcasts/ | | James Le | Datacast | https://datacast.simplecast.com/ | | Mike Delgado | DataTalk | https://podcasts.apple.com/us/podcast/datatalk/id1398548129 | | Matt Housley | Monday Morning Data Chat | https://podcasts.apple.com/us/podcast/monday-morning-data-chat/id1565154727 | | Francesco Gadaleta | Data Science at Home | https://datascienceathome.com/ | | Alli Torban | Data Viz Today | https://dataviztoday.com/ | | Steve Jones | Voice of the DBA | https://voiceofthedba.com/ | | Lea Pica | The Present Beyond Measure Show: Data Storytelling, Presentation & Visualization | https://leapica.com/podcast/ | | Samir Sharma | The Data Strategy Show | https://podcasts.apple.com/us/podcast/the-data-strategy-show/id1515194422 | | Cindi Howson | The Data Chief | https://www.thoughtspot.com/data-chief/podcast | | Cole Nussbaumer Knaflic | storytelling with data podcast | https://storytellingwithdata.libsyn.com/ | | Margot Gerritsen | Women in Data Science | https://www.widsconference.org/podcast.html | | Jonas Christensen | Leaders of Analytics | https://www.leadersofanalytics.com/episode/the-future-of-analytics-leadership-with-john-thompson | | Matt Brady | ZUMA: Data For Good | https://www.youtube.com/@zuma-dataforgood | | Julia Schottenstein | The Analytics Engineering Podcast | https://roundup.getdbt.com/s/the-analytics-engineering-podcast | | | Data Unlocked | https://dataunlocked.buzzsprout.com/ | | Boris Jabes | The Sequel Show | 
https://www.thesequelshow.com/ | | | Data Radicals | https://www.alation.com/podcast/ | | Nicola Askham | The Data Governance | https://www.nicolaaskham.com/podcast | | Boaz Farkash | The Data Engineering Show | https://www.dataengineeringshow.com/ | | Bob Haffner | The Engineering Side of Data | https://podcasts.apple.com/us/podcast/the-engineering-side-of-data/id1566999533 | | Dan Linstedt | Data Vault Alliance | https://datavaultalliance.com/category/news/podcasts/ | | Dustin Schimek | Data Ideas | https://podcasts.apple.com/us/podcast/data-ideas/id1650322207 | | Alex Merced | The datanation | https://podcasts.apple.com/be/podcast/the-datanation-podcast-podcast-for-data-engineers/id1608638822 | | Thomas Bustos | Let's Talk AI | https://www.youtube.com/@lets-talk-ai | | Jahanvee Narang | Decoding Data Analytics | https://www.youtube.com/@decodingdataanalytics/videos | ================================================ FILE: sections/10-Updates.md ================================================ Updates ============ What's new? Here you can find a list of all the updates with links to the sections - **2025-07-21** - Added a list of my students' favorite datasets and APIs [click here](07-DataSources.md#Student-Favorites) - **2025-06-11** - Released the first playable demo of the Spark Optimization Playground [click here](https://bit.ly/play-spark-optimization) - **2025-03-25** - Added a detailed 14-week roadmap to Data Engineering for Data Scientists [click here](01-Introduction.md#roadmap-for-data-scientists) - **2025-03-05** - Added a detailed 11-week roadmap to Data Engineering for Beginners [click here](01-Introduction.md#roadmap-for-beginners) - **2025-03-04** - Added a detailed 10-week roadmap to Data Engineering for Data Analysts [click here](01-Introduction.md#roadmap-for-data-analysts) - **2024-12-11** - Prepared the 81 most important questions for platform & pipeline design.
Specifically looking at the data source and the goals [click here](03-AdvancedSkills.md#81-platform-and-pipeline-design-questions) - **2024-11-28** - Prepared a GenAI RAG example project that you can run on your own computer without internet. It uses Ollama with the Mistral model and Elasticsearch. Working on a way of creating embeddings from PDF files and inserting them into Elasticsearch for queries [click here](04-HandsOnCourse.md#genai-retrieval-augmented-generation-with-ollama-and-elasticsearch) - **2024-11-23** - Added an overview of AWS and Azure cloud certifications for Data Engineers. From beginners to experts [click here](09-BooksAndCourses.md#Certifications) - **2024-07-31** - Added 10 platform architecture react videos I did to the "Best Practices" section. This way you get a better feeling of what companies are doing and which tools they use [click here](06-BestPracticesCloud.md#best-practices) - **2024-07-17** - Added 20 API interview questions and their answers [click here](08-InterviewQuestions.md#apis) - Added 10 Python interview questions and their answers [click here](03-AdvancedSkills.md#python) - **2024-07-08** - Added a large article about Snowflake and dbt for Data Engineers [click here](03-AdvancedSkills.md#analytical-data-stores) - Added a new section "Analytical Data Stores" to Advanced skills with the Snowflake & dbt infos. - Put SQL and NoSQL datastores into a new section "Transactional Data Stores" - **2024-03-20** - Added roadmap for Software Engineers / Computer Scientists [click here](01-Introduction.md#roadmap-for-software-engineers) - Added many questions and answers from my interview on the Super Data Science Podcast (plus links to YouTube and the Podcast) [click here](01-Introduction.md#Interview-with-Andreas-on-the-Super-Data-Science-Podcast) - **2024-03-13** - Added "How to become a Senior Data Engineer" live stream series as a blog post with images shown in the live streams and the links to the videos.
[click here](01-Introduction.md#how-to-become-a-senior-data-engineer) - **2024-03-08** - Included the Data Engineering skills matrix in the introduction with a link to the live stream. [click here](01-Introduction.md#data-engineers-skills-matrix) - **2024-03-01** - Added updates section - Reworked the Hands-on courses section with 5 free courses / tutorials from Andreas on YouTube [click here](04-HandsOnCourse.md) - **2024-02-28** - Added Data Engineering Roadmap for Data Scientists: [click here](01-Introduction.md#roadmap-for-data-scientists) - **2024-02-25** - Data Engineering Roadmap for Software Engineers: [click here](01-Introduction.md#roadmap-for-software-engineers) - **2024-02-20** - Data Engineering Roadmap for Data Analysts: [click here](01-Introduction.md#roadmap-for-data-analysts)