Repository: andkret/Cookbook
Branch: master
Commit: d58e9a70e031
Files: 24
Total size: 595.6 KB
Directory structure:
gitextract_czkzynip/
├── .github/
│ └── workflows/
│ ├── copy-to-documenation-branch.yml
│ └── linkchecker.yml
├── .gitignore
├── Code Examples/
│ ├── #102 Spark Week Day 3.txt
│ ├── GenAI-RAG/
│ │ ├── conversations.json
│ │ ├── cvpipeline.py
│ │ ├── docker-compose.yml
│ │ ├── index.py
│ │ └── query.py
│ └── Movies.txt
├── FUNDING.yml
├── LICENSE
├── README.md
├── images/
│ └── Data-Engineering-Roadmap-for.textClipping
└── sections/
├── 01-Introduction.md
├── 02-BasicSkills.md
├── 03-AdvancedSkills.md
├── 04-HandsOnCourse.md
├── 05-CaseStudies.md
├── 06-BestPracticesCloud.md
├── 07-DataSources.md
├── 08-InterviewQuestions.md
├── 09-BooksAndCourses.md
└── 10-Updates.md
================================================
FILE CONTENTS
================================================
================================================
FILE: .github/workflows/copy-to-documenation-branch.yml
================================================
name: Copy from master to documentation branch

# Controls when the action will run.
on:
  # Triggers the workflow on push events, but only for the master branch
  push:
    branches: [master]

  # Allows you to run this workflow manually from the Actions tab
  workflow_dispatch:

jobs:
  # Copies the contents of /images on master into /static/images/ on the documentation branch.
  copy-images:
    runs-on: ubuntu-latest
    steps:
      # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it.
      # v4 replaces v2, which was built on a Node.js runtime that GitHub has retired.
      - uses: actions/checkout@v4
      - name: Copy Images
        uses: andstor/copycat-action@v3
        with:
          personal_token: ${{ secrets.ACTION_TOKEN }}
          src_branch: master
          src_path: /images/.
          dst_owner: andkret
          dst_repo_name: Cookbook
          dst_path: /static/images/
          dst_branch: documentation
          clean: true
          commit_message: "Images copied from master to documentation branch!"

  # Copies the contents of /sections on master into /docs/ on the documentation branch.
  copy-sections:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Copy Markdowns
        uses: andstor/copycat-action@v3
        with:
          personal_token: ${{ secrets.ACTION_TOKEN }}
          src_branch: master
          src_path: /sections/.
          dst_owner: andkret
          dst_repo_name: Cookbook
          dst_path: /docs/
          dst_branch: documentation
          clean: true
          commit_message: "Sections copied from master to documentation branch!"
# copy-readme:
# runs-on: ubuntu-latest
# steps:
# - uses: actions/checkout@v2
# - name: Copy Markdowns
# uses: andstor/copycat-action@v3
# with:
# personal_token: ${{ secrets.ACTION_TOKEN }}
# src_branch: master
# src_path: README.md
# dst_owner: andkret
# dst_repo_name: Cookbook
# dst_path: /docs/00-TableOfContents.md
# dst_branch: documentation
# clean: false
# commit_message: "Readme copied from master to documentation branch!"
# copy-readme:
# runs-on: ubuntu-latest
# steps:
# - uses: actions/checkout@v2
# - name: Copy Markdowns
# uses: andstor/copycat-action@v3
# with:
# personal_token: ${{ secrets.PERSONAL_TOKEN }}
# src_branch: master
# src_path: /README.md
# dst_owner: andkret
# dst_repo_name: Cookbook
# dst_path: /docs/
# dst_branch: documentation
# commit_message: "README.md copied from master to documentation branch!"
================================================
FILE: .github/workflows/linkchecker.yml
================================================
#on:
# schedule:
# - cron: '0 9 * * 1'
# workflow_dispatch:
#jobs:
# linkChecker:
# runs-on: ubuntu-latest
# steps:
# - name: update setuptools
# run: |
# python3 -m pip install --upgrade pip setuptools wheel
# - uses: actions/checkout@v2
# - name: Link Checker
# uses: lycheeverse/lychee-action@master
# with:
# args: --verbose --no-progress --accept 200,204,206,406,429,999 --include-mail ./sections/*.md
# - name: Create Issue From File
# uses: peter-evans/create-issue-from-file@v5
# with:
# title: Link Checker Report
# content-filepath: ./lychee/out.md
# labels: report, automated issue
================================================
FILE: .gitignore
================================================
# Ignore build artefacts
# NOTE(review): these extensions look like LaTeX auxiliary/output files — confirm against the build setup
*.aux
*.log
*.lof
*.lot
*.toc
*.out
*.synctex.gz
# Ignore everything inside node_modules/
node_modules/*
================================================
FILE: Code Examples/#102 Spark Week Day 3.txt
================================================
// Spark notebook script: loads Movies.txt, derives several RDDs from it and exposes
// each of them to SQL as a temporary view.
// Expects the notebook/shell to provide `sc` and `spark`; the `%sql` lines at the
// bottom are notebook interpreter directives, not Scala.
// Views are registered with createOrReplaceTempView, the replacement for
// registerTempTable (deprecated since Spark 2.0).

// Read in the text file; each RDD element is one raw line
val input = sc.textFile("/notebook/Movies.txt")

// Wrap every raw line in a single-column record
case class MovieLine(Line: String)
val movieline = input.map(line => MovieLine(line))
movieline.toDF().createOrReplaceTempView("MovieLine")

// Let's map the date and the genre.
// Lines are split on ';' and fields 0 and 3 are read, so every line must have at least four fields.
case class DateAndGenre(myDate: String, Genre: String)
val dateandgenre = input.map(line => line.split(";")).map(s => DateAndGenre(s(0), s(3)))
dateandgenre.toDF().createOrReplaceTempView("DateAndGenre")

// Count how many movies per year: emit (field 0, 1) pairs, then sum the ones per key
case class MovieDate(Line: String, myCount: Int)
val countdate = input.map(line => line.split(";")).map(s => (s(0), 1))
countdate.toDF().createOrReplaceTempView("countdate")
val reduceddate = countdate.reduceByKey(_ + _).map(s => MovieDate(s._1, s._2))
reduceddate.toDF().createOrReplaceTempView("MovieDate")

// Flatten every ';'-separated field into its own element of the RDD
val flatmappedinput = input.flatMap(line => line.split(";"))
flatmappedinput.toDF().createOrReplaceTempView("flatinput")

// Read the input directly into a DataFrame (first line is treated as the header)
val inputasdf = spark.read.format("csv").option("header", "true").option("delimiter", ";").load("/notebook/Movies.txt")
inputasdf.createOrReplaceTempView("inputdf")

/* //Use this to store the dataframe as parquet on the local drive
val reduceddf = reduceddate.toDF()
reduceddf.write.parquet("/notebook/movie.parquet")
*/

// Read the parquet file.
// This needs /notebook/movie.parquet to exist already; the block above that writes it is commented out.
val parquetFileDF = spark.read.parquet("/notebook/movie.parquet")
parquetFileDF.createOrReplaceTempView("ParquetRead")

//SparkSQL Queries:

//Visualize the raw RDD
%sql select * from MovieLine

//Visualize the map reduced RDD with count of movies per year
%sql select Line, myCount from MovieDate order by myCount desc

//Visualize the mapped RDD and count the nr. of movies per year in SparkSQL
%sql select myDate, count(myDate) as counted from DateAndGenre group by myDate order by counted desc

%sql select * from flatinput

%sql select * from ParquetRead
================================================
FILE: Code Examples/GenAI-RAG/conversations.json
================================================
[
{
"conversation_id": 456,
"customer_name": "Alice Brown",
"agent_name": "Emily Johnson",
"policy_number": "ABC5678",
"conversation": "Customer: Hi, my name is Alice Brown. Date of Birth is September 20th, 1980, Address is 456 Oak St, Springfield, IL 62701, and my Policy Number is XYZ9876543.\nAgent: Good afternoon, Alice. How may I assist you today?\nCustomer: Hello, Emily. I have a question regarding my coverage.\nCustomer: My kitchen caught fire, and I'm concerned about the damages.\nAgent: I'm sorry to hear that, Alice. Let me review your policy for fire damage coverage.\nAgent: It appears that fire damage is covered under your policy. We'll assist you with the claim process.\nCustomer: Thank you, Emily. I appreciate your help during this stressful time.\nAgent: You're welcome, Alice. We're here to support you. Please don't hesitate to reach out if you need further assistance.\nCustomer: I'll keep that in mind. Have a great day!\nAgent: You too, Alice. Take care.",
"summary": "A customer inquires about policy coverage after a kitchen fire, expressing concern, and the agent confirms coverage and offers assistance, providing support and reassurance throughout the conversation."
},
{
"conversation_id": 789,
"customer_name": "David Johnson",
"agent_name": "Sarah Wilson",
"policy_number": "LMN9012",
"conversation": "Customer: Good morning, I'm David Johnson. My Date of Birth is May 5th, 1975, Address is 789 Maple Ave, Seattle, WA 98101, and my Policy Number is PQR3456789.\nAgent: Good morning, David. How can I assist you today?\nCustomer: Hi, Sarah. I'm concerned about my home insurance coverage.\nCustomer: A pipe burst in my basement, and there's significant water damage.\nAgent: I'm sorry to hear that, David. Let me check your policy for coverage related to water damage.\nAgent: It seems that water damage from burst pipes is covered under your policy.\nCustomer: That's a relief. I'll need to file a claim as soon as possible.\nAgent: We'll assist you with the claim process, David. Is there anything else I can help you with?\nCustomer: No, that's all for now. Thank you for your assistance, Sarah.\nAgent: You're welcome, David. Please feel free to reach out if you have any further questions or concerns.\nCustomer: I will. Have a great day!\nAgent: You too, David. Take care.",
"summary": "A customer expresses concern about home insurance coverage due to water damage from a burst pipe, and the agent confirms coverage, offering assistance with the claim process, resulting in relief and gratitude expressed by the customer."
},
{
"conversation_id": 101,
"customer_name": "Emily Green",
"agent_name": "Jack Smith",
"policy_number": "DEF4567",
"conversation": "Customer: Hi there, I'm Emily Green. My Date of Birth is April 10th, 1988, Address is 101 Pine St, Boston, MA 02101, and my Policy Number is DEF4567.\nAgent: Hello, Emily. How can I assist you today?\nCustomer: Hi, Jack. I have a question about my policy.\nCustomer: A window in my living room shattered during a storm. Is this covered?\nAgent: Let me check your policy for coverage related to storm damage.\nAgent: Unfortunately, damage to windows from storms is not covered under your policy.\nCustomer: Oh, that's disappointing. Is there any way to add coverage for this?\nAgent: Yes, we offer endorsements for specific perils like storm damage to windows. I can provide you with more information on that.\nCustomer: Please do. I want to ensure I'm protected in case this happens again.\nAgent: I'll send you an email with details on our endorsement options. Feel free to reach out if you have any further questions.\nCustomer: Thank you, Jack. I appreciate your help.\nAgent: You're welcome, Emily. Have a great day!",
"summary": "A customer inquires about coverage for a shattered window after a storm, but it's not covered under the policy. The agent suggests adding endorsements for specific perils like storm damage to windows, providing further information and assistance, resulting in the customer's appreciation."
},
{
"conversation_id": 102,
"customer_name": "Michael White",
"agent_name": "Sarah Johnson",
"policy_number": "GHI7890",
"conversation": "Customer: Good afternoon, I'm Michael White. My Date of Birth is February 25th, 1970, Address is 202 Elm St, Chicago, IL 60601, and my Policy Number is GHI7890.\nAgent: Good afternoon, Michael. How may I assist you today?\nCustomer: Hi, Sarah. I have a question about my policy coverage.\nCustomer: My roof has started leaking after heavy rainfall. Will my insurance cover repairs?\nAgent: Let me review your policy for coverage related to roof leaks.\nAgent: Roof leaks due to rain are typically covered under your policy.\nCustomer: That's a relief. I'll need to schedule repairs as soon as possible.\nAgent: We'll assist you with the claim process, Michael. Is there anything else I can help you with?\nCustomer: No, that's all for now. Thank you for your assistance, Sarah.\nAgent: You're welcome, Michael. Please feel free to reach out if you have any further questions or concerns.\nCustomer: I will. Have a great day!\nAgent: You too, Michael. Take care.",
"summary": "A customer seeks clarification on policy coverage for a leaking roof after heavy rainfall, and the agent confirms that such damages are typically covered under the policy. The agent offers assistance with the claim process, resulting in the customer expressing relief and gratitude."
},
{
"conversation_id": 103,
"customer_name": "Sophia Jones",
"agent_name": "Emily Wilson",
"policy_number": "JKL0123",
"conversation": "Customer: Hi, I'm Sophia Jones. My Date of Birth is November 15th, 1985, Address is 303 Cedar St, Miami, FL 33101, and my Policy Number is JKL0123.\nAgent: Hello, Sophia. How may I assist you today?\nCustomer: Hello, Emily. I have a question about my policy.\nCustomer: There's been a break-in at my home, and some valuable items are missing. Are they covered?\nAgent: Let me check your policy for coverage related to theft.\nAgent: Yes, theft of personal belongings is covered under your policy.\nCustomer: That's a relief. I'll need to file a claim for the stolen items.\nAgent: We'll assist you with the claim process, Sophia. Is there anything else I can help you with?\nCustomer: No, that's all for now. Thank you for your assistance, Emily.\nAgent: You're welcome, Sophia. Please feel free to reach out if you have any further questions or concerns.\nCustomer: I will. Have a great day!\nAgent: You too, Sophia. Take care.",
"summary": "A customer inquires about coverage for stolen items after a break-in at home, and the agent confirms that theft of personal belongings is covered under the policy. The agent offers assistance with the claim process, resulting in the customer expressing relief and gratitude."
},
{
"conversation_id": 104,
"customer_name": "Ethan Wilson",
"agent_name": "Jack Brown",
"policy_number": "MNO3456",
"conversation": "Customer: Hello, I'm Ethan Wilson. My Date of Birth is July 5th, 1995, Address is 404 Oak St, Los Angeles, CA 90001, and my Policy Number is MNO3456.\nAgent: Good morning, Ethan. How may I assist you today?\nCustomer: Hi, Jack. I have a question regarding my policy.\nCustomer: My garage door was damaged in a storm. Is this covered?\nAgent: Let me review your policy for coverage related to storm damage.\nAgent: Yes, damage to the garage door from storms is covered under your policy.\nCustomer: That's a relief. I'll need to schedule repairs as soon as possible.\nAgent: We'll assist you with the claim process, Ethan. Is there anything else I can help you with?\nCustomer: No, that's all for now. Thank you for your assistance, Jack.\nAgent: You're welcome, Ethan. Please feel free to reach out if you have any further questions or concerns.\nCustomer: I will. Have a great day!\nAgent: You too, Ethan. Take care.",
"summary": "A customer inquires about coverage for a damaged garage door after a storm, and the agent confirms that such damages are covered under the policy. The agent offers assistance with the claim process, resulting in the customer expressing relief and gratitude."
},
{
"conversation_id": 105,
"customer_name": "Olivia Taylor",
"agent_name": "Sarah Smith",
"policy_number": "PQR7890",
"conversation": "Customer: Hi there, I'm Olivia Taylor. My Date of Birth is December 30th, 1990, Address is 505 Pine St, San Francisco, CA 94101, and my Policy Number is PQR7890.\nAgent: Good afternoon, Olivia. How may I assist you today?\nCustomer: Hi, Sarah. I have a question regarding my policy.\nCustomer: A tree in my backyard has fallen and damaged my fence. Will my insurance cover repairs?\nAgent: Let me check your policy for coverage related to fallen trees.\nAgent: Yes, damage to the fence from fallen trees is covered under your policy.\nCustomer: That's a relief. I'll need to schedule repairs as soon as possible.\nAgent: We'll assist you with the claim process, Olivia. Is there anything else I can help you with?\nCustomer: No, that's all for now. Thank you for your assistance, Sarah.\nAgent: You're welcome, Olivia. Please feel free to reach out if you have any further questions or concerns.\nCustomer: I will. Have a great day!\nAgent: You too, Olivia. Take care.",
"summary": "A customer inquires about coverage for a damaged fence due to a fallen tree, and the agent confirms that such damages are covered under the policy. The agent offers assistance with the claim process, resulting in the customer expressing relief and gratitude."
},
{
"conversation_id": 106,
"customer_name": "William Anderson",
"agent_name": "Jack Johnson",
"policy_number": "STU2345",
"conversation": "Customer: Hello, I'm William Anderson. My Date of Birth is August 20th, 1980, Address is 606 Elm St, Dallas, TX 75201, and my Policy Number is STU2345.\nAgent: Good morning, William. How may I assist you today?\nCustomer: Hi, Jack. I have a question about my policy.\nCustomer: My basement flooded during heavy rainfall. Is water damage covered?\nAgent: Let me review your policy for coverage related to water damage.\nAgent: Yes, water damage from flooding is covered under your policy.\nCustomer: That's a relief. I'll need to schedule repairs as soon as possible.\nAgent: We'll assist you with the claim process, William. Is there anything else I can help you with?\nCustomer: No, that's all for now. Thank you for your assistance, Jack.\nAgent: You're welcome, William. Please feel free to reach out if you have any further questions or concerns.\nCustomer: I will. Have a great day!\nAgent: You too, William. Take care.",
"summary": "A customer inquires about coverage for water damage after a basement flooding, and the agent confirms that such damages are covered under the policy. The agent offers assistance with the claim process, resulting in the customer expressing relief and gratitude."
},
{
"conversation_id": 123,
"customer_name": "Alice Smith",
"agent_name": "Emily Johnson",
"policy_number": "ABC5678",
"conversation": "Customer: Hi, my name is Alice Smith, Date of Birth is Feb 15th 1985, Address is 123 Main St, Anytown, NY 12345, and my Policy Number is XYZ9876.\nAgent: Hello, Alice. How can I assist you today?\nCustomer: I have a question about my home insurance coverage.\nCustomer: I noticed some water damage in my basement, and I'm not sure if it's covered.\nAgent: I'm sorry to hear about the damage. Let me review your policy to see what's covered.\nAgent: Based on your policy, water damage from burst pipes is covered, but it depends on the cause of the damage.\nCustomer: What if it's from heavy rainfall or flooding?\nAgent: Unfortunately, damage from flooding is typically not covered under standard home insurance policies.\nCustomer: That's disappointing. Is there anything I can do to get coverage for flooding?\nAgent: You may want to consider purchasing a separate flood insurance policy to ensure you're protected.\nCustomer: I see. Thank you for your help.\nAgent: You're welcome, Alice. If you have any further questions, feel free to ask.",
"summary": "A customer inquires about home insurance coverage for water damage in the basement, and the agent confirms that damage from burst pipes is covered but explains that flooding is typically not covered under standard policies. The agent advises the customer to consider purchasing a separate flood insurance policy for protection, resulting in the customer expressing gratitude for the assistance provided."
},
{
"conversation_id": 124,
"customer_name": "Michael Johnson",
"agent_name": "Sarah Brown",
"policy_number": "DEF1234",
"conversation": "Customer: Hi there, my name is Michael Johnson, Date of Birth is May 10th 1978, Address is 456 Oak St, Smalltown, CA 98765, and my Policy Number is QRS5678.\nAgent: Good afternoon, Michael. How can I help you today?\nCustomer: I'm having an issue with my home insurance policy.\nCustomer: There's been some damage to my roof due to a recent storm, and I'm not sure if it's covered.\nAgent: I'm sorry to hear about the damage. Let me check your policy to provide you with accurate information.\nAgent: According to your policy, damage caused by storms, including wind and hail damage to your roof, should be covered.\nCustomer: That's a relief to hear. What do I need to do next?\nAgent: You'll need to file a claim with your insurance company and provide documentation of the damage, such as photos or repair estimates.\nCustomer: Okay, I'll get started on that right away.\nAgent: If you need any assistance with the claims process, feel free to reach out to us for help.\nCustomer: Thank you for your assistance.\nAgent: You're welcome, Michael. Have a great day!",
"summary": "A customer reports damage to their roof caused by a recent storm and seeks clarification on coverage under their home insurance policy. The agent confirms that such damage is typically covered, advises the customer to file a claim with the insurance company, and offers assistance with the claims process, resulting in the customer expressing gratitude for the assistance provided."
},
{
"conversation_id": 125,
"customer_name": "Jennifer Brown",
"agent_name": "David Wilson",
"policy_number": "GHI7890",
"conversation": "Customer: Hello, I'm Jennifer Brown, born on March 20th, 1980, residing at 789 Elm St, Suburbia, TX 54321, and my Policy Number is LMN9012.\nAgent: Good morning, Jennifer. How can I assist you today?\nCustomer: Hi, I have a question about my home insurance coverage.\nCustomer: A pipe burst in my kitchen, and there's water damage everywhere.\nAgent: I'm sorry to hear about the incident. Let me check your policy to see what's covered.\nAgent: Based on your policy, sudden and accidental water damage, including burst pipes, should be covered.\nCustomer: That's a relief. What should I do next?\nAgent: You'll need to file a claim with your insurance company and provide documentation of the damage.\nCustomer: Okay, I'll do that right away. Thank you for your help.\nAgent: You're welcome, Jennifer. If you have any further questions, feel free to reach out.",
"summary": "A customer reports water damage in the kitchen due to a burst pipe and seeks clarification on coverage under their home insurance policy. The agent confirms that sudden and accidental water damage, including burst pipes, should be covered, advises the customer to file a claim with the insurance company, and offers further assistance, resulting in the customer expressing gratitude for the help provided."
},
{
"conversation_id": 126,
"customer_name": "Robert Johnson",
"agent_name": "Michelle Adams",
"policy_number": "PQR3456",
"conversation": "Customer: Hi, my name is Robert Johnson, DOB is July 5th, 1976, and I live at 456 Maple Ave, Cityville, OH 67890. My Policy Number is STU2345.\nAgent: Hello, Robert. How can I assist you today?\nCustomer: I have a concern about my home insurance policy.\nCustomer: My neighbor's tree fell on my fence during the storm, causing damage.\nAgent: I'm sorry to hear about the damage. Let me review your policy to see if it's covered.\nAgent: Unfortunately, damage caused by your neighbor's tree falling on your fence may not be covered under your policy.\nCustomer: That's disappointing. Is there anything I can do to get coverage?\nAgent: You may want to speak with your neighbor about their homeowner's insurance policy, as their coverage may apply to this situation.\nCustomer: I'll do that. Thank you for your assistance.\nAgent: You're welcome, Robert. If you have any further questions, don't hesitate to ask.",
"summary": "A customer expresses concern about damage to their fence caused by a neighbor's tree falling during a storm and seeks clarification on coverage under their home insurance policy. The agent advises that such damage may not be covered under the customer's policy and suggests contacting the neighbor's homeowner's insurance for potential coverage, resulting in the customer expressing gratitude for the assistance provided."
},
{
"conversation_id": 127,
"customer_name": "Emily Davis",
"agent_name": "Daniel Miller",
"policy_number": "UVW4567",
"conversation": "Customer: Hi, I'm Emily Davis, born on September 12th, 1982, residing at 789 Pine St, Hilltown, FL 45678. My Policy Number is XYZ7890.\nAgent: Good afternoon, Emily. How can I assist you today?\nCustomer: Hello, I need to make a change to my home insurance policy.\nCustomer: I recently renovated my kitchen, and I need to update the coverage to reflect the changes.\nAgent: I can assist you with that. Let me update your policy with the new information.\nAgent: Your policy has been updated to reflect the renovation. Is there anything else I can help you with?\nCustomer: That's all for now. Thank you for your help.\nAgent: You're welcome, Emily. If you have any further questions or need assistance in the future, feel free to reach out.",
"summary": "A customer requests a change to their home insurance policy to reflect recent renovations to their kitchen. The agent assists with updating the policy accordingly, and the customer expresses gratitude for the help provided."
},
{
"conversation_id": 128,
"customer_name": "Jessica Wilson",
"agent_name": "Ryan Thompson",
"policy_number": "WXY6789",
"conversation": "Customer: Hello, I'm Jessica Wilson, DOB is April 30th, 1974, and I live at 234 Oak St, Suburbia, CA 98765. My Policy Number is ABC1234.\nAgent: Good morning, Jessica. How can I assist you today?\nCustomer: Hi, I need to add an additional coverage to my home insurance policy.\nCustomer: I recently purchased some expensive jewelry, and I want to make sure it's covered in case of theft or loss.\nAgent: I can help you with that. Let me add a rider to your policy to cover the additional jewelry.\nAgent: Your policy has been updated to include coverage for your jewelry. Is there anything else I can assist you with?\nCustomer: That's all for now. Thank you for your help.\nAgent: You're welcome, Jessica. If you have any further questions or need assistance in the future, feel free to reach out.",
"summary": "A customer requests to add additional coverage to their home insurance policy for recently purchased expensive jewelry to ensure protection against theft or loss. The agent assists by adding a rider to the policy for the additional coverage, and the customer expresses gratitude for the help provided."
},
{
"conversation_id": 129,
"customer_name": "Andrew Brown",
"agent_name": "Sophia Martinez",
"policy_number": "JKL2345",
"conversation": "Customer: Hi there, I'm Andrew Brown, born on November 25th, 1986, residing at 345 Cedar St, Smalltown, TX 67890. My Policy Number is DEF5678.\nAgent: Good afternoon, Andrew. How can I assist you today?\nCustomer: Hello, I need to update my contact information on my home insurance policy.\nCustomer: I recently moved, and I need to provide my new address and phone number.\nAgent: I can assist you with that. Let me update your contact information in our system.\nAgent: Your contact information has been updated. Is there anything else I can help you with?\nCustomer: That's all for now. Thank you for your help.\nAgent: You're welcome, Andrew. If you have any further questions or need assistance in the future, feel free to reach out.",
"summary": "A customer requests to update their contact information on their home insurance policy due to a recent move. The agent assists by updating the customer's address and phone number in the system, and the customer expresses gratitude for the help provided."
},
{
"conversation_id": 130,
"customer_name": "Michelle Evans",
"agent_name": "Jacob Clark",
"policy_number": "MNO7890",
"conversation": "Customer: Hi, I'm Michelle Evans, DOB is June 15th, 1979, and I live at 567 Elm St, Cityville, NY 23456. My Policy Number is PQR9012.\nAgent: Good morning, Michelle. How can I assist you today?\nCustomer: Hello, I need to cancel my home insurance policy.\nCustomer: I'm selling my house, so I no longer need coverage.\nAgent: I can assist you with that. Let me process the cancellation for you.\nAgent: Your home insurance policy has been cancelled, effective immediately. Is there anything else I can help you with?\nCustomer: That's all, thank you for your help.\nAgent: You're welcome, Michelle. If you have any further questions or need assistance in the future, feel free to reach out.",
"summary": "A customer requests to cancel their home insurance policy as they are selling their house and no longer require coverage. The agent assists by processing the cancellation, and the customer expresses gratitude for the help provided."
},
{
"conversation_id": 131,
"customer_name": "David Garcia",
"agent_name": "Emma Moore",
"policy_number": "RST9012",
"conversation": "Customer: Hi, I'm David Garcia, born on August 8th, 1988, residing at 789 Maple St, Suburbia, CA 34567. My Policy Number is UVW1234.\nAgent: Good morning, David. How can I assist you today?\nCustomer: Hello, I need to inquire about adding a home office coverage to my policy.\nCustomer: I recently started working from home and have valuable equipment that I want to protect.\nAgent: I understand. Let me check your policy to see what options are available.\nAgent: It appears that we offer a home business coverage option that may suit your needs.\nCustomer: That sounds perfect. Please add it to my policy.\nAgent: Your policy has been updated to include home business coverage. Is there anything else I can help you with?\nCustomer: That's all for now. Thank you for your assistance.\nAgent: You're welcome, David. If you have any further questions or need assistance in the future, feel free to reach out.",
"summary": "A customer requests to add home office coverage to their policy as they recently started working from home and want to protect valuable equipment. The agent confirms the availability of a home business coverage option and assists by adding it to the policy, resulting in the customer expressing gratitude for the help provided."
},
{
"conversation_id": 132,
"customer_name": "Sarah Hernandez",
"agent_name": "John Lee",
"policy_number": "LMN3456",
"conversation": "Customer: Hi there, I'm Sarah Hernandez, born on January 12th, 1983, residing at 123 Cedar St, Hilltown, TX 12345. My Policy Number is GHI6789.\nAgent: Good afternoon, Sarah. How can I assist you today?\nCustomer: Hello, I recently got a pet dog and wanted to know if it affects my home insurance policy.\nCustomer: I heard that some breeds are considered high-risk and may affect coverage.\nAgent: Let me check your policy and see how pets are addressed.\nAgent: According to your policy, owning a dog may affect your liability coverage.\nCustomer: What do I need to do to ensure my coverage remains intact?\nAgent: You may need to disclose the breed and any history of aggression to your insurance company.\nCustomer: I'll do that. Thank you for your help.\nAgent: You're welcome, Sarah. If you have any further questions or need assistance in the future, feel free to reach out.",
"summary": "A customer inquires about the impact of getting a pet dog on their home insurance policy, concerned about potential breed-related issues. The agent checks the policy and explains that owning a dog may affect liability coverage, advising the customer to disclose breed information and any history of aggression to the insurance company to ensure coverage remains intact, resulting in the customer expressing gratitude for the assistance provided."
},
{
"conversation_id": 133,
"customer_name": "Christopher Martinez",
"agent_name": "Olivia Taylor",
"policy_number": "OPQ4567",
"conversation": "Customer: Hi, I'm Christopher Martinez, DOB is April 5th, 1980, and I live at 456 Walnut St, Smalltown, NY 89012. My Policy Number is JKL7890.\nAgent: Good morning, Christopher. How can I assist you today?\nCustomer: Hello, I need to renew my home insurance policy.\nCustomer: My policy is expiring soon, and I want to ensure continuous coverage.\nAgent: Let me check your policy renewal options and provide you with the necessary information.\nAgent: Your policy renewal options have been reviewed, and I can assist you with the renewal process.\nCustomer: That's great. Please proceed with the renewal.\nAgent: Your policy has been successfully renewed. Is there anything else I can help you with?\nCustomer: That's all for now. Thank you for your assistance.\nAgent: You're welcome, Christopher. If you have any further questions or need assistance in the future, feel free to reach out.",
"summary": "A customer requests to renew their home insurance policy as it is expiring soon, seeking continuous coverage. The agent reviews renewal options, assists with the renewal process, and confirms successful renewal, resulting in the customer expressing gratitude for the assistance provided."
},
{
"conversation_id": 134,
"customer_name": "Amy Thompson",
"agent_name": "William Davis",
"policy_number": "CDE7890",
"conversation": "Customer: Hi, I'm Amy Thompson, born on October 18th, 1984, residing at 789 Birch St, Suburbia, CA 23456. My Policy Number is EFG1234.\nAgent: Good afternoon, Amy. How can I assist you today?\nCustomer: Hello, I need to report a claim for damage to my home.\nCustomer: There was a fire in my kitchen, and there's significant damage.\nAgent: I'm sorry to hear about the fire. Let me assist you with filing a claim.\nAgent: Your claim has been initiated, and an adjuster will contact you shortly for further assistance.\nCustomer: Thank you for your help.\nAgent: You're welcome, Amy. If you have any further questions or need assistance in the future, feel free to reach out.",
"summary": "A customer reports a claim for damage to their home due to a fire in the kitchen, seeking assistance with the claims process. The agent initiates the claim and assures the customer that an adjuster will contact them shortly for further assistance, resulting in the customer expressing gratitude for the help provided."
},
{
"conversation_id": 135,
"customer_name": "Linda Wilson",
"agent_name": "Michael Brown",
"policy_number": "FGH9012",
"conversation": "Customer: Hi, I'm Linda Wilson, born on June 25th, 1975, residing at 234 Pine St, Cityville, TX 56789. My Policy Number is IJK2345.\nAgent: Good morning, Linda. How can I assist you today?\nCustomer: Hello, I'm extremely disappointed with the service I've received from your company.\nCustomer: I filed a claim for water damage a month ago, and I still haven't received any updates.\nAgent: I apologize for the delay in processing your claim, Linda. Let me investigate the status for you.\nAgent: It appears that there was an oversight in processing your claim. I will expedite the review process and provide you with an update shortly.\nCustomer: This is unacceptable. I expect better service from my insurance provider.\nAgent: I completely understand your frustration, Linda. Rest assured, I will do everything in my power to resolve this matter promptly.\nCustomer: I hope so. I've been a loyal customer for years, and this experience has been disappointing.\nAgent: I sincerely apologize for the inconvenience, Linda. I'll keep you updated on the progress of your claim.\nCustomer: Thank you.",
"summary": "A customer expresses extreme disappointment with the service received from the company, citing a delay in processing a claim for water damage filed a month ago. The agent acknowledges the oversight, apologizes for the inconvenience, and assures the customer of expedited review and updates on the claim's progress, with the customer expressing hope for a resolution and gratitude for the attention to the matter."
},
{
"conversation_id": 136,
"customer_name": "Brian Adams",
"agent_name": "Jessica Miller",
"policy_number": "KLM3456",
"conversation": "Customer: Hi, I'm Brian Adams, DOB is December 10th, 1982, and I live at 345 Oak St, Hilltown, CA 78901. My Policy Number is NOP4567.\nAgent: Good afternoon, Brian. How can I assist you today?\nCustomer: Hello, I'm beyond frustrated with your company's billing practices.\nCustomer: I received a notice stating that my premium has increased significantly without any explanation.\nAgent: I apologize for the inconvenience, Brian. Let me review your policy to understand the reason for the increase.\nAgent: It appears that there was an error in the calculation of your premium. I will escalate this issue to our billing department and ensure it's rectified immediately.\nCustomer: This is unacceptable. I expect transparency and fairness from my insurance provider.\nAgent: I completely understand your frustration, Brian. Rest assured, I will personally oversee the resolution of this matter and keep you updated on the progress.\nCustomer: I appreciate your assistance, but this shouldn't have happened in the first place.\nAgent: I apologize once again, Brian. I'll ensure that corrective measures are put in place to prevent similar issues in the future.\nCustomer: I hope so.",
"summary": "A customer expresses frustration with the company's billing practices, citing a significant increase in premiums without explanation. The agent apologizes for the inconvenience, acknowledges the error in premium calculation, and assures the customer of immediate escalation and resolution, with the customer emphasizing the expectation of transparency and fairness from their insurance provider and the agent expressing commitment to preventive measures to avoid similar issues in the future."
},
{
"conversation_id": 137,
"customer_name": "Karen Garcia",
"agent_name": "Richard Martinez",
"policy_number": "QRS5678",
"conversation": "Customer: Hi, I'm Karen Garcia, born on September 5th, 1979, residing at 456 Cedar St, Smalltown, NY 34567. My Policy Number is TUV6789.\nAgent: Good morning, Karen. How can I assist you today?\nCustomer: Hello, I'm extremely dissatisfied with your company's claims handling process.\nCustomer: I filed a claim for roof damage three weeks ago, and there's been no progress or communication since then.\nAgent: I apologize for the lack of updates, Karen. Let me investigate the status of your claim and provide you with an update.\nAgent: It appears that there was a delay in processing your claim due to a backlog. I will expedite the review process and ensure you receive a timely resolution.\nCustomer: This is unacceptable. I've been left in the dark for too long, and it's causing me a lot of stress.\nAgent: I understand your frustration, Karen. Rest assured, I will personally oversee the handling of your claim and keep you informed every step of the way.\nCustomer: I expect better from my insurance provider. This level of service is unacceptable.\nAgent: I apologize for the inconvenience, Karen. I'll do everything in my power to address your concerns and ensure a satisfactory outcome.\nCustomer: I hope so.",
"summary": "A customer expresses extreme dissatisfaction with the company's claims handling process, citing a lack of progress and communication regarding a filed claim for roof damage. The agent apologizes for the inconvenience, acknowledges the delay due to a backlog, and assures the customer of expedited review and personal oversight to ensure timely resolution, with the customer emphasizing the expectation of better service and the agent expressing commitment to addressing concerns and achieving a satisfactory outcome."
},
{
"conversation_id": 138,
"customer_name": "Jason Miller",
"agent_name": "Michelle Harris",
"policy_number": "VWX7890",
"conversation": "Customer: Hi, I'm Jason Miller, DOB is November 15th, 1983, and I live at 567 Elm St, Suburbia, CA 45678. My Policy Number is YZA8901.\nAgent: Good afternoon, Jason. How can I assist you today?\nCustomer: Hello, I'm furious with your company's lack of responsiveness.\nCustomer: I've been trying to contact your claims department for days, but I keep getting transferred and put on hold.\nAgent: I apologize for the inconvenience, Jason. Let me escalate your issue to a supervisor for immediate assistance.\nAgent: A supervisor will contact you shortly to address your concerns and ensure a prompt resolution.\nCustomer: This is unacceptable. I expect better customer service from my insurance provider.\nAgent: I completely understand your frustration, Jason. Rest assured, we will do everything in our power to rectify the situation and regain your trust.\nCustomer: I hope so. This experience has been extremely frustrating and disappointing.\nAgent: I sincerely apologize for the inconvenience, Jason. We value your feedback, and we're committed to improving our service standards.\nCustomer: I appreciate that.",
"summary": "A customer expresses fury over the company's lack of responsiveness, stating difficulties in contacting the claims department despite attempts over several days. The agent apologizes, escalates the issue to a supervisor for immediate assistance, and assures the customer of efforts to rectify the situation and regain trust, with the customer emphasizing the expectation of better customer service and the agent expressing commitment to improvement and appreciation for the feedback."
},
{
"conversation_id": 139,
"customer_name": "Rachel Clark",
"agent_name": "Daniel Wilson",
"policy_number": "BCD1234",
"conversation": "Customer: Hi, I'm Rachel Clark, born on February 20th, 1981, residing at 678 Walnut St, Cityville, TX 89012. My Policy Number is EFG2345.\nAgent: Good morning, Rachel. How can I assist you today?\nCustomer: Hello, I'm extremely disappointed with your company's claims denial decision.\nCustomer: I filed a claim for water damage, and it was denied without any explanation.\nAgent: I apologize for the frustration, Rachel. Let me review the details of your claim and the reason for the denial.\nAgent: It appears that the damage was deemed to be the result of gradual wear and tear, which is not covered under your policy.\nCustomer: This is unacceptable. I've been paying premiums for years, expecting coverage when I need it most.\nAgent: I understand your frustration, Rachel. I'll escalate your concerns to our claims department for further review.\nCustomer: I expect a thorough review of my claim and a fair decision. This denial has caused me a lot of stress.\nAgent: I'll ensure that your claim is reevaluated promptly, Rachel. I apologize for any inconvenience this has caused.\nCustomer: Thank you.",
"summary": "A customer expresses extreme disappointment with the company's claims denial decision regarding water damage, citing lack of explanation. The agent apologizes, reviews the claim details, and explains that the denial was due to damage deemed gradual wear and tear, not covered under the policy. The customer emphasizes the expectation of coverage after years of premium payments, and the agent escalates the concerns for further review, promising a thorough reevaluation and apologizing for any inconvenience caused."
},
{
"conversation_id": 140,
"customer_name": "Emily Rodriguez",
"agent_name": "David Garcia",
"policy_number": "LMN5678",
"conversation": "Customer: Hi, I'm Emily Rodriguez, born on April 8th, 1986, residing at 789 Birch St, Hilltown, CA 23456. My Policy Number is OPQ6789.\nAgent: Good morning, Emily. How can I assist you today?\nCustomer: Hello, I'm extremely frustrated with your company's decision to deny my claim.\nCustomer: I filed a claim for damage caused by a fallen tree, and it was denied without any explanation.\nAgent: I understand your frustration, Emily. Let me review the details of your claim and provide you with an explanation.\nAgent: It appears that the damage was deemed to be the result of an excluded peril, which is not covered under your policy.\nCustomer: This is unacceptable. I've been paying premiums for years, expecting coverage when I need it most.\nAgent: I apologize for the inconvenience, Emily. I'll escalate your concerns to our claims department for further review.\nCustomer: I expect a thorough review of my claim and a fair decision. This denial has caused me a lot of stress.\nAgent: I'll ensure that your claim is reevaluated promptly, Emily. I apologize for any inconvenience this has caused.\nCustomer: Thank you.",
"summary": "Customer expresses frustration with claim denial for tree damage, demands explanation. Agent apologizes, cites damage as excluded peril, promises review. Customer stresses expectation of coverage, agent escalates concerns for thorough reevaluation, apologizes for inconvenience."
},
{
"conversation_id": 141,
"customer_name": "Matthew Lopez",
"agent_name": "Emma Wilson",
"policy_number": "RST7890",
"conversation": "Customer: Hi, I'm Matthew Lopez, DOB is October 12th, 1984, and I live at 456 Cedar St, Smalltown, NY 34567. My Policy Number is TUV8901.\nAgent: Good afternoon, Matthew. How can I assist you today?\nCustomer: Hello, I'm extremely disappointed with your company's decision to deny my claim.\nCustomer: I filed a claim for water damage, and it was denied without any explanation.\nAgent: I understand your frustration, Matthew. Let me review the details of your claim and provide you with an explanation.\nAgent: It appears that the damage was deemed to be the result of a maintenance issue, which is not covered under your policy.\nCustomer: This is unacceptable. I've been paying premiums for years, expecting coverage when I need it most.\nAgent: I apologize for the inconvenience, Matthew. I'll escalate your concerns to our claims department for further review.\nCustomer: I expect a thorough review of my claim and a fair decision. This denial has caused me a lot of stress.\nAgent: I'll ensure that your claim is reevaluated promptly, Matthew. I apologize for any inconvenience this has caused.\nCustomer: Thank you.",
"summary": "Customer expresses disappointment with claim denial for water damage, demands explanation. Agent apologizes, cites damage as maintenance issue, promises review. Customer stresses expectation of coverage, agent escalates concerns for thorough reevaluation, apologizes for inconvenience."
},
{
"conversation_id": 142,
"customer_name": "Amanda Thompson",
"agent_name": "Michael Johnson",
"policy_number": "UVW9012",
"conversation": "Customer: Hi, I'm Amanda Thompson, born on March 15th, 1983, residing at 567 Oak St, Suburbia, CA 67890. My Policy Number is XYZ0123.\nAgent: Good morning, Amanda. How can I assist you today?\nCustomer: Hello, I'm extremely frustrated with your company's decision to deny my claim.\nCustomer: I filed a claim for theft of personal belongings, and it was denied without any explanation.\nAgent: I understand your frustration, Amanda. Let me review the details of your claim and provide you with an explanation.\nAgent: It appears that the theft was deemed to be the result of negligence, which is not covered under your policy.\nCustomer: This is unacceptable. I've been paying premiums for years, expecting coverage when I need it most.\nAgent: I apologize for the inconvenience, Amanda. I'll escalate your concerns to our claims department for further review.\nCustomer: I expect a thorough review of my claim and a fair decision. This denial has caused me a lot of stress.\nAgent: I'll ensure that your claim is reevaluated promptly, Amanda. I apologize for any inconvenience this has caused.\nCustomer: Thank you.",
"summary": "Customer frustrated by claim denial for theft of personal belongings, seeks explanation. Agent apologizes, attributes theft to negligence, promises review. Customer emphasizes expectation of coverage, agent escalates concerns for thorough reevaluation, apologizes for inconvenience."
},
{
"conversation_id": 143,
"customer_name": "Jennifer Lee",
"agent_name": "Olivia Brown",
"policy_number": "CDE2345",
"conversation": "Customer: Hi, I'm Jennifer Lee, born on August 20th, 1980, residing at 678 Pine St, Cityville, TX 45678. My Policy Number is EFG3456.\nAgent: Good afternoon, Jennifer. How can I assist you today?\nCustomer: Hello, I'm extremely disappointed with your company's decision to deny my claim.\nCustomer: I filed a claim for fire damage, and it was denied without any explanation.\nAgent: I understand your frustration, Jennifer. Let me review the details of your claim and provide you with an explanation.\nAgent: It appears that the fire was deemed to be the result of arson, which is not covered under your policy.\nCustomer: This is unacceptable. I've been paying premiums for years, expecting coverage when I need it most.\nAgent: I apologize for the inconvenience, Jennifer. I'll escalate your concerns to our claims department for further review.\nCustomer: I expect a thorough review of my claim and a fair decision. This denial has caused me a lot of stress.\nAgent: I'll ensure that your claim is reevaluated promptly, Jennifer. I apologize for any inconvenience this has caused.\nCustomer: Thank you.",
"summary": "Customer frustrated by claim denial for fire damage, seeks explanation. Agent attributes fire to arson, not covered under policy. Customer emphasizes expectation of coverage, agent escalates concerns for thorough reevaluation, apologizes for inconvenience."
},
{
"conversation_id": 140,
"customer_name": "Emily White",
"agent_name": "Andrew Thompson",
"policy_number": "EFG2345",
"conversation": "Customer: Hi, I'm Emily White, born on July 10th, 1980, residing at 789 Pine St, Hilltown, CA 56789. My Policy Number is HIJ3456.\nAgent: Good morning, Emily. How can I assist you today?\nCustomer: Hello, I'm extremely disappointed with your company's decision to deny my claim.\nCustomer: I filed a claim for water damage, but it was denied due to 'lack of timely notification.'\nAgent: I apologize for the inconvenience, Emily. Let me review the details of your claim denial.\nAgent: It appears that the damage occurred several weeks ago, and our policy requires claims to be reported within 72 hours.\nCustomer: This is ridiculous. I wasn't aware of the damage until recently, and I promptly filed the claim.\nAgent: I understand your frustration, Emily. I'll escalate your concerns to our claims department for further review.\nCustomer: I expect a fair evaluation of my claim. This denial has caused me a lot of stress and financial burden.\nAgent: I'll ensure that your claim is reevaluated promptly, Emily. I apologize for any inconvenience this has caused.\nCustomer: Thank you.",
"summary": "Customer Emily White disappointed by claim denial for water damage due to 'lack of timely notification'. Agent attributes denial to damage reported beyond policy's 72-hour limit. Customer expresses frustration and financial burden. Agent apologizes and promises prompt reevaluation of the claim."
},
{
"conversation_id": 141,
"customer_name": "James Rodriguez",
"agent_name": "Sophia Martinez",
"policy_number": "KLM4567",
"conversation": "Customer: Hi, I'm James Rodriguez, DOB is March 15th, 1977, and I live at 456 Cedar St, Smalltown, TX 67890. My Policy Number is NOP5678.\nAgent: Good afternoon, James. How can I assist you today?\nCustomer: Hello, I'm extremely frustrated with your company's decision to deny my claim.\nCustomer: I filed a claim for hail damage to my roof, but it was denied due to 'pre-existing damage.'\nAgent: I apologize for the inconvenience, James. Let me review the details of your claim denial.\nAgent: It appears that there was evidence of prior damage to your roof, which was not covered under your policy.\nCustomer: This is outrageous. I had no knowledge of any pre-existing damage, and I've been paying premiums for years.\nAgent: I understand your frustration, James. I'll escalate your concerns to our claims department for further review.\nCustomer: I expect a thorough investigation of my claim and a fair decision. This denial has caused me significant financial hardship.\nAgent: I'll ensure that your claim is reevaluated promptly, James. I apologize for any inconvenience this has caused.\nCustomer: Thank you.",
"summary": "Customer James Rodriguez frustrated by claim denial for hail damage due to 'pre-existing damage'. Agent attributes denial to evidence of prior damage not covered by policy. Customer expresses outrage and financial hardship. Agent promises prompt reevaluation of the claim."
},
{
"conversation_id": 141,
"customer_name": "James Rodriguez",
"agent_name": "Sophia Martinez",
"policy_number": "KLM4567",
"conversation": "Customer: Hi, I'm James Rodriguez, DOB is March 15th, 1977, and I live at 456 Cedar St, Smalltown, TX 67890. My Policy Number is NOP5678.\nAgent: Good afternoon, James. How can I assist you today?\nCustomer: Hello, I'm extremely frustrated with your company's decision to deny my claim.\nCustomer: I filed a claim for hail damage to my roof, but it was denied due to 'pre-existing damage.'\nAgent: I apologize for the inconvenience, James. Let me review the details of your claim denial.\nAgent: It appears that there was evidence of prior damage to your roof, which was not covered under your policy.\nCustomer: This is outrageous. I had no knowledge of any pre-existing damage, and I've been paying premiums for years.\nAgent: I understand your frustration, James. I'll escalate your concerns to our claims department for further review.\nCustomer: I expect a thorough investigation of my claim and a fair decision. This denial has caused me significant financial hardship.\nAgent: I'll ensure that your claim is reevaluated promptly, James. I apologize for any inconvenience this has caused.\nCustomer: Thank you.",
"summary": " Customer disputes claim denial for hail damage, citing lack of awareness of pre-existing damage. Agent apologizes, attributing denial to evidence of prior damage not covered by the policy. Customer insists on thorough review and fair decision. Agent promises escalation for reevaluation."
},
{
"conversation_id": 142,
"customer_name": "Melissa Thompson",
"agent_name": "David Wilson",
"policy_number": "PQR5678",
"conversation": "Customer: Hi, I'm Melissa Thompson, born on December 5th, 1979, residing at 678 Elm St, Suburbia, NY 90123. My Policy Number is STU6789.\nAgent: Good morning, Melissa. How can I assist you today?\nCustomer: Hello, I'm extremely disappointed with your company's decision to deny my claim.\nCustomer: I filed a claim for fire damage to my garage, but it was denied due to 'policy exclusions.'\nAgent: I apologize for the inconvenience, Melissa. Let me review the details of your claim denial.\nAgent: It appears that damage caused by arson is specifically excluded from coverage under your policy.\nCustomer: This is infuriating. The fire was accidental, and I had nothing to do with it.\nAgent: I understand your frustration, Melissa. I'll escalate your concerns to our claims department for further review.\nCustomer: I expect a fair evaluation of my claim. This denial has caused me a lot of stress and financial hardship.\nAgent: I'll ensure that your claim is reevaluated promptly, Melissa. I apologize for any inconvenience this has caused.\nCustomer: Thank you.",
"summary": "Customer disputes claim denial for fire damage, claiming it was accidental. Agent apologizes and explains policy exclusion for damage caused by arson. Customer insists on fair evaluation and expresses stress and financial hardship. Agent promises prompt reevaluation of the claim."
},
{
"conversation_id": 143,
"customer_name": "Steven Lee",
"agent_name": "Emma Moore",
"policy_number": "UVW6789",
"conversation": "Customer: Hi, I'm Steven Lee, DOB is August 20th, 1985, and I live at 789 Oak St, Cityville, CA 23456. My Policy Number is XYZ7890.\nAgent: Good afternoon, Steven. How can I assist you today?\nCustomer: Hello, I'm extremely frustrated with your company's decision to deny my claim.\nCustomer: I filed a claim for theft of personal belongings, but it was denied due to 'lack of evidence.'\nAgent: I apologize for the inconvenience, Steven. Let me review the details of your claim denial.\nAgent: It appears that there was insufficient evidence to support the claim of theft.\nCustomer: This is unacceptable. My belongings were stolen, and I provided all the necessary documentation.\nAgent: I understand your frustration, Steven. I'll escalate your concerns to our claims department for further review.\nCustomer: I expect a thorough investigation of my claim and a fair decision. This denial has caused me significant financial loss.\nAgent: I'll ensure that your claim is reevaluated promptly, Steven. I apologize for any inconvenience this has caused.\nCustomer: Thank you.",
"summary": "Customer disputes claim denial for theft of personal belongings due to lack of evidence. Agent apologizes and explains insufficient evidence for the claim. Customer insists on fair investigation and expresses financial loss. Agent promises prompt reevaluation of the claim."
},
{
"conversation_id": 144,
"customer_name": "Nicole Brown",
"agent_name": "John Davis",
"policy_number": "LMN6789",
"conversation": "Customer: Hi, I'm Nicole Brown, born on May 30th, 1983, residing at 123 Maple St, Suburbia, TX 45678. My Policy Number is ABC2345.\nAgent: Good morning, Nicole. How can I assist you today?\nCustomer: Hello, I'm extremely disappointed with your company's decision to deny my claim.\nCustomer: I filed a claim for storm damage to my fence, but it was denied due to 'acts of nature exclusion.'\nAgent: I apologize for the inconvenience, Nicole. Let me review the details of your claim denial.\nAgent: It appears that damage caused by storms, including wind and hail, is specifically excluded from coverage under your policy.\nCustomer: This is frustrating. I thought I was protected against such events.\nAgent: I understand your frustration, Nicole. I'll escalate your concerns to our claims department for further review.\nCustomer: I expect a fair evaluation of my claim. This denial has caused me a lot of stress and financial burden.\nAgent: I'll ensure that your claim is reevaluated promptly, Nicole. I apologize for any inconvenience this has caused.\nCustomer: Thank you.",
"summary": "Customer's claim for storm damage to her fence is denied due to \"acts of nature exclusion.\" Agent apologizes and explains the policy's exclusion. Customer expresses frustration and financial burden. Agent promises a prompt reevaluation of the claim."
}
]
================================================
FILE: Code Examples/GenAI-RAG/cvpipeline.py
================================================
# 2024-11-25
# Andreas Kretz
# This code currently doesn't work because the preparation of the text for ElasticSearch doesn't work
# Try to fix this and write the data
import json, os # Importing JSON for handling JSON data and os for interacting with the operating system
import fitz # PyMuPDF
from llama_index.core import Document, Settings # Importing Document class and Settings for managing LlamaIndex
from llama_index.core.node_parser import SentenceSplitter # Importing SentenceSplitter to split text into smaller chunks
from llama_index.core.ingestion import IngestionPipeline # Importing IngestionPipeline for managing data ingestion
from llama_index.embeddings.ollama import OllamaEmbedding # Importing OllamaEmbedding for generating text embeddings
from llama_index.vector_stores.elasticsearch import ElasticsearchStore # Importing ElasticsearchStore for vector storage
from dotenv import load_dotenv # Importing load_dotenv to load environment variables from a .env file
from llama_index.core import VectorStoreIndex, QueryBundle, Response, Settings
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.llms.ollama import Ollama
from index_raw import es_vector_store
from ollama import chat
from ollama import ChatResponse
# Extract the text from the PDF with PyMuPDF
def extract_text_from_pdf(path):
    """Return the text of every page of a PDF as one string.

    Args:
        path: Location of the PDF file; passed unchanged to ``fitz.open``.

    Returns:
        The text of all pages concatenated in page order. No separator is
        inserted between pages.
    """
    # Open the document as a context manager so it is closed again even when
    # text extraction raises part-way through the file.
    with fitz.open(path) as doc:
        text = "".join(page.get_text() for page in doc)
    print(text)  # Echo the extracted text so a run can be inspected
    return text
# Feed the PDF text into mistral and get a JSON string back.
# Free-form replies were not reliably valid JSON (escaping of \n and ' broke),
# so the model is asked for JSON output explicitly via the `format` argument.
def prepare_text_to_json(text_to_summarize):
    """Ask the local "mistral" model to wrap a text in a flat JSON object.

    The prompt asks for two attributes: ``text`` (the text itself) and
    ``name`` (the name of the person the text is about).

    Args:
        text_to_summarize: Plain text that is appended to the instruction.

    Returns:
        The content of the model's reply message as a string. It is NOT
        parsed here; callers that need the attributes must ``json.loads`` it.
    """
    instruction_template = "Here's a text. Encapsulate it into a json as a string and don't turn it into json attributes. Keep it flat. The attribute where the text should go into is called text. Create another attribute of the json called name and put the name of the person there:"
    response: ChatResponse = chat(
        model='mistral',
        messages=[
            {
                'role': 'user',
                'content': instruction_template + text_to_summarize,
            },
        ],
        # Constrain the reply to syntactically valid JSON so quotes and
        # newlines inside the text arrive properly escaped.
        format='json',
    )
    content = response['message']['content']
    print(".....Prepared this json.....\n")
    print(content)
    return content
# Vector store backed by the local Elasticsearch instance.
# NOTE: this assignment replaces the `es_vector_store` name imported from
# `index_raw` at the top of the file.
es_vector_store = ElasticsearchStore(
    es_url="http://localhost:9200",  # Endpoint of the local Elasticsearch instance
    index_name="student_cvs",  # Elasticsearch index to write to
    text_field="conversation",  # Field that stores the original text
    vector_field="conversation_vector",  # Field that stores the embedding vector
)

# Local "mistral" model served through Ollama
local_llm = Ollama(model="mistral")
def main():
    """Index one CV: PDF -> text -> JSON via mistral -> chunks in Elasticsearch.

    Raises:
        ValueError: If the model reply is not a JSON object that contains
            both a ``text`` and a ``name`` attribute.
    """
    ollama_embedding = OllamaEmbedding("mistral")  # Embedding model ("mistral" via Ollama) used for the chunks

    # Ingestion pipeline: split into 350-token chunks with a 50-token overlap,
    # embed every chunk, then write the result to the Elasticsearch store.
    pipeline = IngestionPipeline(
        transformations=[
            SentenceSplitter(chunk_size=350, chunk_overlap=50),
            ollama_embedding,
        ],
        vector_store=es_vector_store,
    )

    extracted = extract_text_from_pdf('Liam_McGivney_CV.pdf')  # Extract the text from the CV
    raw_reply = prepare_text_to_json(extracted)  # Model reply as a *string*

    # The reply is a string, so it must be parsed before its attributes can be
    # read. Models sometimes wrap the object in prose or a code fence; keep
    # only the outermost {...} span when there is one.
    start, end = raw_reply.find("{"), raw_reply.rfind("}")
    candidate = raw_reply[start:end + 1] if 0 <= start < end else raw_reply
    try:
        prepped_json = json.loads(candidate)
    except json.JSONDecodeError as exc:
        raise ValueError("Model reply is not valid JSON: " + repr(raw_reply)) from exc
    if not isinstance(prepped_json, dict) or "text" not in prepped_json or "name" not in prepped_json:
        raise ValueError("Model reply must be a JSON object with 'text' and 'name': " + repr(raw_reply))

    # pipeline.run takes a list of documents, so the single CV is wrapped in one.
    documents = [Document(text=prepped_json['text'], metadata={"name": prepped_json['name']})]

    pipeline.run(documents=documents)  # Process the document and store the embeddings in Elasticsearch
    print(".....Done running pipeline.....\n")  # Print a completion message


# Entry point of the script
if __name__ == "__main__":
    main()  # Call the main function
================================================
FILE: Code Examples/GenAI-RAG/docker-compose.yml
================================================
# Local development stack: one Elasticsearch node plus Kibana, both at version 8.16.0.
services:
  # Elasticsearch Docker Images: https://www.docker.elastic.co/
  elasticsearch:
    image: docker.elastic.co/elasticsearch/elasticsearch:8.16.0
    container_name: elasticsearch
    environment:
      # Security features are switched off: clients connect over plain HTTP without credentials.
      - xpack.security.enabled=false
      # Run as a single node instead of trying to form a cluster.
      - discovery.type=single-node
    ulimits:
      # -1 lifts the locked-memory limit for the container.
      memlock:
        soft: -1
        hard: -1
      # Raise the open-file limit to 65536.
      nofile:
        soft: 65536
        hard: 65536
    cap_add:
      - IPC_LOCK
    volumes:
      # Named volume (declared at the bottom of this file) mounted at the Elasticsearch data directory.
      - elasticsearch-data17:/usr/share/elasticsearch/data
    ports:
      # host:container port mappings
      - 9200:9200
      - 9300:9300
  kibana:
    container_name: kibana
    image: docker.elastic.co/kibana/kibana:8.16.0
    environment:
      # Kibana reaches Elasticsearch through the compose service name.
      - ELASTICSEARCH_HOSTS=http://elasticsearch:9200
    ports:
      - 5601:5601
    # Start Kibana only after the elasticsearch service has been started.
    depends_on:
      - elasticsearch
volumes:
  # Named volume referenced by the elasticsearch service above.
  elasticsearch-data17:
    driver: local
================================================
FILE: Code Examples/GenAI-RAG/index.py
================================================
import json, os # Importing JSON for handling JSON data and os for interacting with the operating system
from llama_index.core import Document, Settings # Importing Document class and Settings for managing LlamaIndex
from llama_index.core.node_parser import SentenceSplitter # Importing SentenceSplitter to split text into smaller chunks
from llama_index.core.ingestion import IngestionPipeline # Importing IngestionPipeline for managing data ingestion
from llama_index.embeddings.ollama import OllamaEmbedding # Importing OllamaEmbedding for generating text embeddings
from llama_index.vector_stores.elasticsearch import ElasticsearchStore # Importing ElasticsearchStore for vector storage
from dotenv import load_dotenv # Importing load_dotenv to load environment variables from a .env file
def get_documents_from_file(file):
    """Read a JSON file of conversations and return them as Document objects.

    Args:
        file: Path of a JSON file whose top-level value is a list of objects,
            each carrying a 'conversation' and a 'conversation_id' key.

    Returns:
        A list with one Document per entry: the 'conversation' value becomes
        the document text and 'conversation_id' is stored in its metadata.

    Raises:
        KeyError: If an entry lacks 'conversation' or 'conversation_id'.
    """
    # JSON is UTF-8 by specification; naming the encoding keeps the result
    # independent of the platform's default text encoding.
    with open(file=file, mode='rt', encoding='utf-8') as f:
        conversations = json.load(f)  # The top-level value is a list, one item per conversation

    return [
        Document(text=item['conversation'],
                 metadata={"conversation_id": item['conversation_id']})
        for item in conversations
    ]
# Vector store backed by the local Elasticsearch instance
es_vector_store = ElasticsearchStore(
    es_url="http://localhost:9200",  # Endpoint of the local Elasticsearch instance
    index_name="calls",  # Elasticsearch index to write to
    text_field="conversation",  # Field that stores the original text
    vector_field="conversation_vector",  # Field that stores the embedding vector
)
# Uncomment this if using Elastic Cloud and ensure ELASTIC_CLOUD_ID and ELASTIC_API_KEY are set in the .env file
# Load the .env file contents into environment variables
# This is used to access sensitive information like API keys or credentials
# load_dotenv('.env')
# es_vector_store = ElasticsearchStore(
# index_name="calls", # Name of the Elasticsearch index
# vector_field='conversation_vector', # Field for vector embeddings
# text_field='conversation', # Field for storing original text
# es_cloud_id=os.getenv("ELASTIC_CLOUD_ID"), # Cloud ID from the .env file
# es_api_key=os.getenv("ELASTIC_API_KEY") # API key from the .env file
# )
def main():
    """Split, embed and store the conversations from conversations.json.

    Builds an ingestion pipeline (sentence splitting followed by embedding
    with the Ollama "mistral" model), runs it over the documents loaded from
    "conversations.json" and writes the result to es_vector_store.
    """
    embed_model = OllamaEmbedding("mistral")
    # SentenceSplitter sizes are token counts per the LlamaIndex docs:
    # chunks of up to 350 tokens, with 50 tokens shared between neighbours.
    splitter = SentenceSplitter(chunk_size=350, chunk_overlap=50)

    # Transformations run in list order: split first, then embed each chunk.
    ingestion = IngestionPipeline(
        transformations=[splitter, embed_model],
        vector_store=es_vector_store,
    )

    # Load the source documents and push them through the pipeline.
    ingestion.run(documents=get_documents_from_file(file="conversations.json"))
    print(".....Done running pipeline.....\n")


# Run the ingestion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
================================================
FILE: Code Examples/GenAI-RAG/query.py
================================================
# query.py
from llama_index.core import VectorStoreIndex, QueryBundle, Response, Settings
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.llms.ollama import Ollama
# The ingestion script in this folder is index.py, so the shared vector store
# is imported from the "index" module. index.py only calls main() under its
# __main__ guard, so this import does not re-run the ingestion pipeline.
from index import es_vector_store

# Local LLM to send the user query to ("mistral" model served by Ollama)
local_llm = Ollama(model="mistral")

# Embedding model used for queries; index.py embeds the documents with the
# same "mistral" model, so query and document vectors are comparable.
Settings.embed_model = OllamaEmbedding("mistral")

# Create a VectorStoreIndex on top of the existing Elasticsearch vector store
index = VectorStoreIndex.from_vector_store(es_vector_store)

# Create a query engine that answers with the local LLM and retrieves the
# 10 most similar chunks for each query
query_engine = index.as_query_engine(llm=local_llm, similarity_top_k=10)

# Define the query string for the question you want to ask the system.
# You'll see that it has some problems understanding the context,
# especially how to find the policy number from the person's name.
#query="Give me summary of water related issues"
#query="What policy number does emily green, born April 10th, 1988 have?"
#query="Who has the policy number DEF4567"
#query="What information about the person do you need to determin the policy number?"
query = "What policy number does emily green, living in 101 Pine St, Boston, MA 02101 have?"

# Create a QueryBundle object, which packages the query and its embedding.
# The embedding is generated using the embedding model configured in Settings.
bundle = QueryBundle(query, embedding=Settings.embed_model.get_query_embedding(query))

# Use the query engine to execute the query bundle against the vector store
# and retrieve the most relevant results
result = query_engine.query(bundle)

# Print the results of the query to the console
print(result)
================================================
FILE: Code Examples/Movies.txt
================================================
Year;Length;Title;Subject;Actor;Actress;Director;Popularity;Awards;*Image
INT;INT;STRING;CAT;CAT;CAT;CAT;INT;BOOL;STRING
1990;111;Tie Me Up! Tie Me Down!;Comedy;Banderas, Antonio;Abril, Victoria;Almodóvar, Pedro;68;No;NicholasCage.png
1991;113;High Heels;Comedy;Bosé, Miguel;Abril, Victoria;Almodóvar, Pedro;68;No;NicholasCage.png
1983;104;Dead Zone, The;Horror;Walken, Christopher;Adams, Brooke;Cronenberg, David;79;No;NicholasCage.png
1979;122;Cuba;Action;Connery, Sean;Adams, Brooke;Lester, Richard;6;No;seanConnery.png
1978;94;Days of Heaven;Drama;Gere, Richard;Adams, Brooke;Malick, Terrence;14;No;NicholasCage.png
1983;140;Octopussy;Action;Moore, Roger;Adams, Maud;Glen, John;68;No;NicholasCage.png
1984;101;Target Eagle;Action;Connors, Chuck;Adams, Maud;Loma, José Antonio de la;14;No;NicholasCage.png
1989;99;American Angels: Baptism of Blood, The;Drama;Bergen, Robert D.;Adams, Trudy;Sebastian, Beverly;28;No;NicholasCage.png
1985;104;Subway;Drama;Lambert, Christopher;Adjani, Isabelle;Besson, Luc;6;No;NicholasCage.png
1990;149;Camille Claudel;Drama;Depardieu, Gérard;Adjani, Isabelle;Nuytten, Bruno;32;No;NicholasCage.png
1982;188;Fanny and Alexander;Drama;Ahlstedt, Börje;Adolphson, Kristina;Bergman, Ingmar;81;Yes;Bergman.png
1982;117;Tragedy of a Ridiculous Man;Drama;Tognazzi, Ugo;Aimee, Anouk;Bertolucci, Bernardo;17;No;NicholasCage.png
1966;103;A Man & a Woman;Drama;Trintignant, Jean-Louis;Aimee, Anouk;Lelouch, Claude;46;Yes;NicholasCage.png
1986;112;A Man & a Woman: Twenty Years Later;Drama;Trintignant, Jean-Louis;Aimee, Anouk;Lelouch, Claude;49;No;NicholasCage.png
1966;103;Un Hombre y una Mujer;Drama;Trintignant, Jean-Louis;Aimee, Anouk;Lelouch, Claude;6;Yes;NicholasCage.png
1985;112;Official Story, The;Drama;Alterio, Hector;Aleandro, Norma;Puenzo, Luiz;39;Yes;NicholasCage.png
1976;150;Lindbergh Kidnapping Case, The;Drama;Hopkins, Anthony;Alexander, Denise;Kulik, Buzz;51;No;AnthonyHopkins.png
1929;84;Blackmail;Mystery;Longden, John;Algood, Sara;Hitchcock, Alfred;2;No;alfredHitchcock.png
1963;109;Donovan's Reef;Comedy;Wayne, John;Allen, Elizabeth;Ford, John;62;No;johnWayne.png
1988;110;Tucker: The Man & His Dream;Drama;Bridges, Jeff;Allen, Joan;Coppola, Francis Ford;68;No;NicholasCage.png
1988;101;Scrooged;Comedy;Murray, Bill;Allen, Karen;Donner, Richard;15;No;NicholasCage.png
1981;116;Raiders of the Lost Ark;Action;Ford, Harrison;Allen, Karen;Spielberg, Steven;8;No;NicholasCage.png
1987;101;Running Man, The;Science Fiction;Schwarzenegger, Arnold;Alonso, Maria Conchita;Glaser, Paul Michael;31;No;NicholasCage.png
1991;105;Predator 2;Action;Glover, Danny;Alonso, Maria Conchita;Hopkins, Stephen;79;No;NicholasCage.png
1988;127;Colors;Drama;Penn, Sean;Alonso, Maria Conchita;Hopper, Dennis;23;No;NicholasCage.png
1990;97;Zandalee;Drama;Cage, Nicolas;Anderson, Erika;Pillsbury, Sam;80;No;NicholasCage.png
1988;108;Miles from Home;Drama;Anderson, Kevin;Anderson, Jo;Sinise, Gary;53;No;NicholasCage.png
1980;;Happy Birthday to Me;Horror;Ford, Glenn;Anderson, Melissa Sue;Thompson, J. Lee;88;No;glennFord.png
1989;88;Final Notice;Mystery;Gerard, Gil;Anderson, Melody;Stern, Steven Hilliard;88;No;NicholasCage.png
1979;110;Quintet;Drama;Newman, Paul;Andersson, Bibi;Altman, Robert;19;No;paulNewman.png
1960;90;Devil's Eye, The;Drama;Kulle, Jarl;Andersson, Bibi;Bergman, Ingmar;20;No;Bergman.png
1957;91;Wild Strawberries;Drama;Sjöström, Victor;Andersson, Bibi;Bergman, Ingmar;42;Yes;Bergman.png
1956;96;Seventh Seal, The;Drama;Sydow, Max von;Andersson, Bibi;Bergman, Ingmar;62;No;Bergman.png
1992;90;Germicide;Drama;Taylor, Rod;Andersson, Bibi;;36;No;NicholasCage.png
1955;86;Dreams;Drama;Björnstrand, Gunnar;Andersson, Harriet;Bergman, Ingmar;14;No;Bergman.png
1955;95;Naked Night, The;Drama;Björnstrand, Gunnar;Andersson, Harriet;Bergman, Ingmar;38;No;Bergman.png
1962;91;Through a Glass Darkly;Drama;Björnstrand, Gunnar;Andersson, Harriet;Bergman, Ingmar;64;Yes;Bergman.png
1972;91;Cries & Whispers;Drama;Josephson, Erland;Andersson, Harriet;Bergman, Ingmar;18;Yes;Bergman.png
1958;104;Barbarian & the Geisha, The;Action;Wayne, John;Ando, Eiko;Huston, John;52;No;johnWayne.png
1967;130;Casino Royale;Comedy;Niven, David;Andress, Ursula;Hughes, Ken;11;No;NicholasCage.png
1962;;Dr. No;Action;Connery, Sean;Andress, Ursula;Young, Terence;7;No;seanConnery.png
1954;103;Elephant Walk;Drama;Finch, Peter;Andrews, Dana;;11;No;NicholasCage.png
1979;121;Ten;Comedy;Moore, Dudley;Andrews, Julie;Edwards, Blake;60;No;NicholasCage.png
1983;118;Man Who Loved Women, The;Comedy;Reynolds, Burt;Andrews, Julie;Edwards, Blake;67;No;NicholasCage.png
1966;190;Hawaii;Drama;Sydow, Max von;Andrews, Julie;Hill, George Roy;8;No;NicholasCage.png
1966;125;Torn Curtain;Mystery;Newman, Paul;Andrews, Julie;Hitchcock, Alfred;35;No;paulNewman.png
1986;107;Duet for One;Drama;Bates, Alan;Andrews, Julie;Konchalovsky, Andrei;82;No;NicholasCage.png
1965;172;Sound of Music, The;Music;Plummer, Christopher;Andrews, Julie;Wise, Robert;59;Yes;NicholasCage.png
1985;55;Gonzo Presents Muppet Weird Stuff;Comedy;Cleese, John;Andrews, Julie;;88;No;NicholasCage.png
1984;140;Tartuffe;Comedy;Depardieu, Gérard;Annen, Paule;Depardieu, Gérard;67;No;NicholasCage.png
1988;104;A New Life;Comedy;Alda, Alan;Ann-Margret;Alda, Alan;53;No;NicholasCage.png
1978;106;Magic;Mystery;Hopkins, Anthony;Ann-Margret;Attenborough, Richard;85;No;AnthonyHopkins.png
1992;286;Tommy;Music;Daltry, Roger;Ann-Margret;Russell, Ken;5;No;NicholasCage.png
1978;108;Big Fix, The;Mystery;Dreyfuss, Richard;Anspach, Susan;Kagan, Jeremy Paul;19;No;NicholasCage.png
1992;95;Alan & Naomi;Drama;Haas, Lukas;Aquino, Vanessa;Vanwagenen, Sterling;3;No;NicholasCage.png
1987;120;Fatal Attraction;Mystery;Douglas, Michael;Archer, Anne;Lyne, Adrian;61;No;NicholasCage.png
1992;117;Patriot Games;Action;Ford, Harrison;Archer, Anne;Noyce, Phillip;28;No;NicholasCage.png
1981;106;Woman Next Door, The;Drama;Depardieu, Gérard;Ardant, Fanny;Truffaut, François;82;No;NicholasCage.png
1992;97;Hunting;Mystery;Savage, John;Armstrong, Kerry;Howson, Frank;68;No;NicholasCage.png
1991;115;Bataan;War;Taylor, Robert;Arnaz, Desi;;68;No;NicholasCage.png
1924;110;Siegfried, The Nibelungenlied;Drama;Richter, Paul;Arnold, Gertrud;Lang, Fritz;79;No;NicholasCage.png
1991;90;Henry, Portrait of a Serial Killer;Horror;Rooker, Michael;Arnold, Tracy;;69;No;NicholasCage.png
1988;118;Big Blue, The;Drama;Barr, Jean-Marc;Arquette, Rosanna;Besson, Luc;7;No;NicholasCage.png
1991;115;Flight of the Intruder;Drama;Glover, Danny;Arquette, Rosanna;Milius, John;51;No;NicholasCage.png
1986;108;Nobody's Fool;Comedy;Roberts, Eric;Arquette, Rosanna;Purcell, Evelyn;52;No;NicholasCage.png
1985;97;After Hours;Comedy;Dunne, Griffin;Arquette, Rosanna;Scorsese, Martin;81;No;NicholasCage.png
1985;104;Desperately Seeking Susan;Comedy;Quinn, Aidan;Arquette, Rosanna;Seidelman, Susan;41;No;NicholasCage.png
1971;102;A New Leaf;Comedy;Matthau, Walter;Arrick, Rose;May, Elaine;83;No;NicholasCage.png
1959;91;Killers of Kilimanjaro;Action;Taylor, Robert;Aslan, Gregoire;Thorpe, Richard;11;No;NicholasCage.png
1926;126;Don Juan;Action;Barrymore, John;Astor, Mary;Crosland, Alan;55;No;NicholasCage.png
1987;102;Babette's Feast;Drama;LaFont, Jean-Philippe;Audran, Stéphane;Axel, Gabriel;79;Yes;NicholasCage.png
1989;118;Vincent, Francois, Paul & the Others;Drama;Montand, Yves;Audran, Stéphane;;20;No;NicholasCage.png
1988;141;Thunderball;Action;Connery, Sean;Auger, Claudine;Young, Terence;8;No;seanConnery.png
1926;66;Lodger (Story of the London Fog);Mystery;Chesney, Arthur;Ault, Marie;Hitchcock, Alfred;76;No;alfredHitchcock.png
1988;103;Appointment with Death;Mystery;Ustinov, Peter;Bacall, Lauren;Donaggio, Michael Winner;75;No;NicholasCage.png
1974;128;Murder on the Orient Express;Mystery;Balsam, Martin;Bacall, Lauren;Lumet, Sidney;8;Yes;NicholasCage.png
1955;115;Blood Alley;War;Wayne, John;Bacall, Lauren;Wellman, William;15;No;johnWayne.png
1977;136;Spy Who Loved Me, The;Action;Moore, Roger;Bach, Barbara;Gilbert, Lewis;27;No;NicholasCage.png
1988;100;Storm;Action;Palfy, David;Bahtia, Stacy Christensen;Winning, David;61;No;NicholasCage.png
1991;89;Bloodbath;Horror;Hopper, Dennis;Baker, Carroll;;37;No;NicholasCage.png
1989;103;Miami Cops;Action;Roundtree, Richard;Baker, Dawn;Bradley, Al;40;No;NicholasCage.png
1996;96;Island of Dr. Moreau, The;Horror;Thewlis, David;Balk, Fairuza;Frankenheimer, John;39;No;NicholasCage.png
1992;100;Eighty-Four Charing Cross Road;Drama;Hopkins, Anthony;Bancroft, Anne;Jones, David;9;No;AnthonyHopkins.png
1980;124;Elephant Man, The;Drama;Hopkins, Anthony;Bancroft, Anne;Lynch, David;3;Yes;AnthonyHopkins.png
1988;90;Dr Alien;Science Fiction;Jacoby, Billy;Barash, Olivia;DeCoteau, David;70;No;NicholasCage.png
1982;120;Creepshow;Horror;Holbrook, Hal;Barbeau, Adrienne;Romero, George A.;70;No;NicholasCage.png
1987;100;Sammy & Rosie Get Laid;Drama;Din, Ayub Khan;Barber, Frances;Frears, Stephen;6;No;NicholasCage.png
1971;101;Goalie's Anxiety at the Penalty Kick, The;Drama;Brauss, Arthur;Bardischewski, Maria;Wenders, Wim;62;No;NicholasCage.png
1957;99;Mademoiselle Striptease;Comedy;Gelin, Daniel;Bardot, Brigitte;Allegret, Marc;25;No;brigitteBardot.png
1969;86;Women, The;Drama;Ronet, Maurice;Bardot, Brigitte;Aurel, Jean;66;No;brigitteBardot.png
1958;77;That Naughty Girl;Comedy;Bretonniere, Jean;Bardot, Brigitte;Boisrond, Michel;37;No;brigitteBardot.png
1959;90;Voulez-Vous Danser Avec Moi?;Comedy;Vidal, Henri;Bardot, Brigitte;Boisrond, Michel;16;No;brigitteBardot.png
1967;100;A Coeur Joie, (Head Over Heels);Action;Terzieff, Laurent;Bardot, Brigitte;Bourguignon, Serge;54;No;brigitteBardot.png
1968;113;Shalako;Western;Connery, Sean;Bardot, Brigitte;Dmytryk, Edward;0;No;brigitteBardot.png
1964;102;Contempt;Drama;Palance, Jack;Bardot, Brigitte;Godard, Jean-Luc;81;No;brigitteBardot.png
1965;100;Dear Brigitte;Comedy;Mumy, Billy;Bardot, Brigitte;Koster, Henry;71;No;brigitteBardot.png
1962;134;A Very Private Affair;Drama;Mastroianni, Marcello;Bardot, Brigitte;Malle, Louis;30;No;brigitteBardot.png
1964;99;Ravishing Idiot, The;Comedy;Perkins, Anthony;Bardot, Brigitte;Molinaro, Edouard;34;No;brigitteBardot.png
1958;90;Bride Is Much Too Beautiful, The;Comedy;Jourdan, Louis;Bardot, Brigitte;Surin, Fred;70;No;brigitteBardot.png
1955;90;Doctor at Sea;Comedy;Bogarde, Dirk;Bardot, Brigitte;Thomas, Ralph;83;No;brigitteBardot.png
1962;100;Le Repos Du Guerrier, (Warrior's Rest);War;Hossein, Robert;Bardot, Brigitte;Vadim, Roger;8;No;brigitteBardot.png
1957;90;And God Created Woman;Drama;Jurgens, Curt;Bardot, Brigitte;Vadim, Roger;29;No;brigitteBardot.png
1973;87;Ms. Don Juan;Drama;Ronet, Maurice;Bardot, Brigitte;Vadim, Roger;39;No;brigitteBardot.png
1987;97;Siesta;Drama;Byrne, Gabriel;Barkin, Ellen;Lambert, Mary;48;No;NicholasCage.png
1932;92;Rich & Strange;Drama;Kendall, Henry;Barry, Joan;Hitchcock, Alfred;57;No;alfredHitchcock.png
1987;104;Lionheart;Action;Stoltz, Eric;Barrymore, Deborah;Schaffner, Franklin J.;9;No;NicholasCage.png
1982;115;E. T. The Extra-Terrestrial;Science Fiction;Wallace, Dee;Barrymore, Drew;Spielberg, Steven;8;Yes;NicholasCage.png
1992;101;Cool World;Drama;Byrne, Gabriel;Basinger, Kim;Bakshi, Ralph;44;No;NicholasCage.png
1988;83;Nadine;Comedy;Bridges, Jeff;Basinger, Kim;Benton, Robert;47;No;NicholasCage.png
1989;126;Batman;Action;Nicholson, Jack;Basinger, Kim;Burton, Tim;14;No;JackNicholson.png
1987;95;Blind Date;Comedy;Willis, Bruce;Basinger, Kim;Edwards, Blake;7;No;NicholasCage.png
1982;101;Mother Lode;Action;Heston, Charlton;Basinger, Kim;Heston, Charlton;40;No;NicholasCage.png
1992;125;Final Analysis;Drama;Gere, Richard;Basinger, Kim;Joanou, Phil;50;No;NicholasCage.png
1983;134;Never Say Never Again;Action;Connery, Sean;Basinger, Kim;Kershner, Irvin;8;No;seanConnery.png
1986;117;Nine & a Half Weeks;Drama;Rourke, Mickey;Basinger, Kim;Lyne, Adrian;7;No;NicholasCage.png
1989;;Killjoy;Mystery;Culp, Robert;Basinger, Kim;Moxey, John Llewellyn;71;No;NicholasCage.png
1986;108;No Mercy;Drama;Gere, Richard;Basinger, Kim;Pearce, Richard;11;No;NicholasCage.png
1991;116;Marrying Man, The;Comedy;Baldwin, Alec;Basinger, Kim;Rees, Jerry;84;No;NicholasCage.png
1990;123;Misery;Horror;Caan, James;Bates, Kathy;Reiner, Rob;48;Yes;NicholasCage.png
1946;93;Crisis;Drama;Andersson, Wiktor;Baude, Anna-Lisa;Bergman, Ingmar;66;No;Bergman.png
1984;95;Samson & Delilah;Drama;Hamilton, Antony;Bauer, Belinda;Philips, Lee;36;No;NicholasCage.png
1990;101;Act of Piracy;Mystery;Busey, Gary;Bauer, Belinda;;74;No;NicholasCage.png
1988;96;Split Decisions;Drama;Hackman, Gene;Beals, Jennifer;Drury, David;52;No;NicholasCage.png
1989;103;Vampire's Kiss;Comedy;Cage, Nicolas;Beals, Jennifer;;49;No;NicholasCage.png
1988;96;Nightmare at Noon;Action;Hauser, Wings;Beck, Kimberly;Mastorakis, Nico;0;No;NicholasCage.png
1990;127;Presumed Innocent;Mystery;Ford, Harrison;Bedelia, Bonnie;Pakula, Alan J.;69;No;NicholasCage.png
1942;123;Reap the Wild Wind;Drama;Wayne, John;Beecher, Janet;DeMille, Cecil B.;59;No;johnWayne.png
1972;100;Pocket Money;Comedy;Newman, Paul;Belford, Christine;Rosenberg, Stuart;55;No;paulNewman.png
1977;102;Mary White;Drama;Flanders, Ed;Beller, Kathleen;Taylor, Jud;2;No;NicholasCage.png
1982;;Catch a Rising Star, Tenth Anniversary;Comedy;Belzer, Richard;Benatar, Pat;;18;No;NicholasCage.png
1990;105;Guilty by Suspicion;Drama;De Niro, Robert;Bening, Annette;Winkler, Irwin;88;No;NicholasCage.png
1948;99;Secret Beyond the Door;Mystery;Redgrave, Michael;Bennett, Joan;Lang, Fritz;31;No;NicholasCage.png
1945;103;Scarlet Street;Drama;Robinson, Edward G.;Bennett, Joan;Lang, Fritz;80;No;NicholasCage.png
1988;76;Daffy Duck's Quackbusters;Action;Blanc, Mel;Bennett, Julie;Ford, Greg;68;No;NicholasCage.png
1985;55;Rowlf's Rhapsodies with the Muppets;Comedy;Burns, George;Berenson, Marisa;;79;No;NicholasCage.png
1982;188;Gandhi;Drama;Kingsley, Ben;Bergen, Candice;Attenborough, Richard;7;Yes;NicholasCage.png
1975;120;Wind & the Lion, The;Action;Connery, Sean;Bergen, Candice;Milius, John;2;No;seanConnery.png
1971;96;Carnal Knowledge;Drama;Nicholson, Jack;Bergen, Candice;Nichols, Mike;10;No;JackNicholson.png
1970;126;Getting Straight;Comedy;Gould, Elliott;Bergen, Candice;Rush, Richard;83;No;NicholasCage.png
1972;90;Scarlet Letter, The;Drama;Albaicín, Rafael;Berger, Senta;Wenders, Wim;55;No;NicholasCage.png
1935;75;Count of Old Town, The;Comedy;Adolphson, Edvin;Bergman, Ingrid;Adolphson, Edvin;72;No;ingridBergman.png
1978;97;Autumn Sonata;Drama;Björk, Halvar;Bergman, Ingrid;Bergman, Ingmar;49;Yes;ingridBergman.png
1944;114;Gaslight;Drama;Boyer, Charles;Bergman, Ingrid;Cukor, George;25;Yes;ingridBergman.png
1958;100;Indiscreet;Drama;Grant, Cary;Bergman, Ingrid;Donen, Stanley;1;No;ingridBergman.png
1941;75;Walpurgis Night;Drama;Sjöström, Victor;Bergman, Ingrid;Edgren, Gustaf;32;No;ingridBergman.png
1948;100;Joan of Arc;Drama;Ferrer, Jose;Bergman, Ingrid;Fleming, Victor;7;No;ingridBergman.png
1982;195;A Woman Called Golda;Drama;Beatty, Ned;Bergman, Ingrid;Gibson, Alan;15;Yes;ingridBergman.png
1969;98;A Walk in the Spring Rain;Drama;Quinn, Anthony;Bergman, Ingrid;Green, Guy;2;No;ingridBergman.png
1949;117;Under Capricorn;Drama;Cotten, Joseph;Bergman, Ingrid;Hitchcock, Alfred;74;No;ingridBergman.png
1946;101;Notorious;Mystery;Grant, Cary;Bergman, Ingrid;Hitchcock, Alfred;42;No;ingridBergman.png
1940;90;June Night;Drama;Widgren, Olof;Bergman, Ingrid;Lindberg, Per;14;No;ingridBergman.png
1961;120;Goodbye Again;Drama;Perkins, Anthony;Bergman, Ingrid;Litvak, Anatole;6;No;ingridBergman.png
1956;106;Anastasia;Drama;Tamiroff, Akim;Bergman, Ingrid;Litvak, Anatole;24;Yes;ingridBergman.png
1945;126;Bells of St. Mary's, The;Drama;Crosby, Bing;Bergman, Ingrid;McCarey, Leo;31;No;ingridBergman.png
1937;91;Intermezzo;Drama;Ekman, Gösta;Bergman, Ingrid;Molander, Gustaf;32;No;ingridBergman.png
1938;104;A Woman's Face;Drama;Svennberg, Tore;Bergman, Ingrid;Molander, Gustaf;49;No;ingridBergman.png
1935;90;Swedenhielms;Drama;Westergren, Håkan;Bergman, Ingrid;Molander, Gustaf;88;No;ingridBergman.png
1939;87;Only One Night;Drama;Adolphson, Edvin;Bergman, Ingrid;Molander, Gustaf;26;No;ingridBergman.png
1938;74;Dollar;Drama;Rydeberg, Georg;Bergman, Ingrid;Molander, Gustaf;19;No;ingridBergman.png
1956;98;Elena & Her Men;Drama;Ferrer, Mel;Bergman, Ingrid;Renoir, Jean;33;No;ingridBergman.png
1952;110;Europa Fifty-One;Drama;Knox, Alexander;Bergman, Ingrid;Rossellini, Roberto;34;No;ingridBergman.png
1953;83;Voyage in Italy;Drama;Sanders, George;Bergman, Ingrid;Rossellini, Roberto;57;No;ingridBergman.png
1954;81;Fear;Drama;Wieman, Mathias;Bergman, Ingrid;Rossellini, Roberto;69;No;ingridBergman.png
1950;107;Stromboli;Drama;Vitale, Mario;Bergman, Ingrid;Rossellini, Roberto;69;No;ingridBergman.png
1969;103;Cactus Flower;Comedy;Matthau, Walter;Bergman, Ingrid;Saks, Gene;67;Yes;ingridBergman.png
1989;105;Hideaways;Comedy;Conover, Bruce;Bergman, Ingrid;;16;No;ingridBergman.png
1990;90;Twenty Four Hours in a Woman's Life;Drama;Torn, Rip;Bergman, Ingrid;;16;No;ingridBergman.png
1987;91;Programmed to Kill;Action;Ginty, Robert;Bergman, Sandahl;Holzman, Allan;71;No;NicholasCage.png
1982;128;Conan the Barbarian;Action;Schwarzenegger, Arnold;Bergman, Sandahl;Milius, John;45;No;NicholasCage.png
1991;91;Raw Nerve;Mystery;Ford, Glenn;Bergman, Sandahl;Prior, David A.;88;No;glennFord.png
1970;94;Think Dirty;Comedy;Feldman, Marty;Berman, Shelley;Clark, Jim;31;No;NicholasCage.png
1982;108;King of Comedy;Drama;De Niro, Robert;Bernhard, Sandra;Scorsese, Martin;84;No;NicholasCage.png
1983;60;Best of the Big Laff Off, The;Comedy;Murphy, Eddie;Bernhard, Sandra;;20;No;NicholasCage.png
1984;158;Amadeus;Drama;Abraham, F. Murray;Berridge, Elizabeth;Forman, Milos;6;Yes;NicholasCage.png
1973;101;White Lightning;Action;Reynolds, Burt;Billingsley, Jennifer;Sargent, Joseph;54;No;NicholasCage.png
1988;172;Unbearable Lightness of Being, The;Drama;Day-Lewis, Daniel;Binoche, Juliette;Kaufman, Philip;5;Yes;NicholasCage.png
1972;124;Life & Times of Judge Roy Bean, The;Western;Newman, Paul;Bisset, Jacqueline;Huston, John;65;No;paulNewman.png
1970;137;Airport;Drama;Lancaster, Burt;Bisset, Jacqueline;Seaton, George;0;Yes;burtLancaster.png
1973;116;Day for Night;Drama;Aumont, Jean-Pierre;Bisset, Jacqueline;Truffaut, François;10;Yes;NicholasCage.png
1952;107;Secrets of Women;Comedy;Malmsten, Birger;Björk, Anita;Bergman, Ingmar;66;No;Bergman.png
1976;116;Burnt Offerings;Horror;Reed, Oliver;Black, Karen;Curtis, Dan;35;No;NicholasCage.png
1969;94;Easy Rider;Drama;Fonda, Peter;Black, Karen;Hopper, Dennis;36;No;NicholasCage.png
1991;98;Five Easy Pieces;Drama;Nicholson, Jack;Black, Karen;Rafelson, Bob;2;No;JackNicholson.png
1974;144;Day of the Locust, The;Drama;Sutherland, Donald;Black, Karen;Schlesinger, John;81;No;NicholasCage.png
1964;112;Goldfinger;Action;Connery, Sean;Blackman, Honor;Hamilton, Guy;77;No;seanConnery.png
1977;117;Exorcist II, The Heretic;Horror;Burton, Richard;Blair, Linda;Boorman, John;29;No;NicholasCage.png
1953;61;White Lightning;;Clements, Stanley;Blondell, Gloria;Bernds, Edward;;No;NicholasCage.png
1942;88;Lady for a Night;Drama;Wayne, John;Blondell, Joan;Leigh, Jason;12;No;johnWayne.png
1968;103;Charly;Drama;Robertson, Cliff;Bloom, Claire;Nelson, Ralph;38;Yes;NicholasCage.png
1973;105;High Plains Drifter;Western;Eastwood, Clint;Bloom, Verna;Eastwood, Clint;57;No;clintEastwood.png
1982;123;Honkytonk Man;Drama;Eastwood, Clint;Bloom, Verna;Eastwood, Clint;69;No;clintEastwood.png
1990;102;Nightbreed;Horror;Cronenberg, David;Bobby, Anne;Barker, Clive;72;No;NicholasCage.png
1987;98;Under the Sun of Satan;Drama;Depardieu, Gérard;Bonnaire, Sandrine;Pialat, Maurice;45;No;NicholasCage.png
1985;105;Vagabond;Drama;Meril, Macha;Bonnaire, Sandrine;Varda, Agnes;49;No;NicholasCage.png
1993;60;Bill Cosby, Live at Harrah's;Comedy;Cosby, Bill;Boosler, Elayne;;13;No;NicholasCage.png
1974;89;Monty Python & the Holy Grail;Comedy;Chapman, Graham;Booth, Connie;Gilliam, Terry;83;No;NicholasCage.png
1993;65;John Cleese on How to Irritate People;Comedy;Cleese, John;Booth, Connie;;62;No;NicholasCage.png
1958;101;Matchmaker, The;Comedy;Perkins, Anthony;Booth, Shirley;Anthony, Joseph;67;No;NicholasCage.png
1981;129;For Your Eyes Only;Action;Moore, Roger;Bouquet, Carole;Glen, John;86;No;NicholasCage.png
1928;139;Wings;War;Rogers, Buddy;Bow, Clara;Wellman, William;44;Yes;NicholasCage.png
1992;106;Medicine Man;Action;Connery, Sean;Bracco, Lorraine;McTiernan, John;6;No;seanConnery.png
1989;;Good Fellas;Drama;De Niro, Robert;Bracco, Lorraine;Scorsese, Martin;15;No;NicholasCage.png
1985;119;Kiss of the Spider Woman;Drama;Hurt, William;Braga, Sonia;Babenco, Hector;10;Yes;NicholasCage.png
1990;121;Rookie, The;Action;Eastwood, Clint;Braga, Sonia;Eastwood, Clint;48;No;clintEastwood.png
1973;129;Sting, The;Drama;Newman, Paul;Brennan, Eileen;Hill, George Roy;83;Yes;paulNewman.png
1958;96;Torpedo Run;War;Ford, Glenn;Brewster, Diane;Pevney, Joseph;50;No;glennFord.png
1986;101;Instant Justice;Drama;Paré, Michael;Bridges, Lynda;Rumar, Craig;45;No;NicholasCage.png
1990;135;Cyrano de Bergerac;Drama;Depardieu, Gérard;Brochet, Anne;Rappeneau, Jean-Paul;76;No;NicholasCage.png
1948;110;Border Street;Drama;Fijewski, Tadeusz;Broniewska, Maria;Ford, Aleksander;73;No;NicholasCage.png
1987;91;Firehouse;Comedy;Hopkins, Barrett;Brown, Violet;Ingvordsen, J. Christian;66;No;NicholasCage.png
1965;123;Morituri;Drama;Brando, Marlon;Brynner, Yul;Wicki, Bernhard;9;No;brando.png
1980;104;From the Life of the Marionettes;Drama;Atzorn, Robert;Buchegger, Christine;Bergman, Ingmar;58;No;Bergman.png
1988;120;Frantic;Mystery;Ford, Harrison;Buckley, Betty;Polanski, Roman;17;No;NicholasCage.png
1978;114;Coma;Science Fiction;Douglas, Michael;Bujold, Geneviève;Crichton, Michael;64;No;NicholasCage.png
1988;117;Dead Ringers;Drama;Irons, Jeremy;Bujold, Geneviève;Cronenberg, David;29;No;NicholasCage.png
1988;90;Golden Ninja Invasion;Action;West, Leonard;Burd, Stephanie;Lambert, Bruce;13;No;NicholasCage.png
1973;122;Exorcist, The;Horror;Sydow, Max von;Burstyn, Ellen;Friedkin, William;28;Yes;NicholasCage.png
1975;112;Alice Doesn't Live Here Anymore;Comedy;Kristofferson, Kris;Burstyn, Ellen;;82;Yes;NicholasCage.png
1982;94;Eyes of the Amaryllis, The;Drama;Bolt, Jonathan;Byrne, Martha;King Keller, Frederick;70;No;NicholasCage.png
1952;109;What Price Glory?;War;Cagney, James;Calvet, Corinne;Ford, John;4;No;johnFord.png
1954;40;Inauguration of the Pleasure Dome;Short;De Brier, Sampson;Cameron, Marjorie;Anger, Kenneth;62;No;NicholasCage.png
1989;114;School Daze;Comedy;Fishburne, Larry;Campbell, Tisha;Lee, Spike;18;No;NicholasCage.png
1990;102;End of Innocence, The;Drama;Heard, John;Cannon, Dyan;Cannon, Dyan;6;No;NicholasCage.png
1971;98;Anderson Tapes, The;Mystery;Connery, Sean;Cannon, Dyan;Lumet, Sidney;1;No;seanConnery.png
1983;50;Father Murphy, A Horse from Heaven;Comedy;Olsen, Merlin;Cannon, Katharine;Claxton, William F.;28;No;NicholasCage.png
1989;80;Skull;Drama;Bideman, Robert;Capone, Nadia;Bergman, Robert;19;No;NicholasCage.png
1987;91;Quick & The Dead, The;Western;Elliott, Sam;Capshaw, Kate;Day, Robert;40;No;NicholasCage.png
1984;94;Best Defense;Comedy;Moore, Dudley;Capshaw, Kate;Huyck, Willard;75;No;NicholasCage.png
1984;99;Dreamscape;Science Fiction;Quaid, Dennis;Capshaw, Kate;Ruben, Joseph;63;No;NicholasCage.png
1989;125;Black Rain;Action;Douglas, Michael;Capshaw, Kate;Scott, Ridley;73;No;NicholasCage.png
1963;138;8 1/2;Drama;Mastroianni, Marcello;Cardinale, Claudia;Fellini, Federico;80;Yes;NicholasCage.png
1935;64;One Frightened Night;Horror;Ford, Wallace;Carlisle, Mary;Cabanne, Christy;33;No;NicholasCage.png
1988;103;Year My Voice Broke, The;Drama;Taylor, Noah;Carmen, Loene;Duigan, John;71;No;NicholasCage.png
1966;175;Is Paris Burning?;War;Belmondo, Jean-Paul;Caron, Leslie;Clément, René;63;No;NicholasCage.png
1974;313;QB VII;Drama;Hopkins, Anthony;Caron, Leslie;Gries, Tom;28;Yes;AnthonyHopkins.png
1977;104;Island of Dr. Moreau, The;Horror;Lancaster, Burt;Carrera, Barbara;Taylor, Don;54;No;burtLancaster.png
1983;104;Beyond the Limit;Drama;Caine, Michael;Carrillo, Elpidia;Mackenzie, John;51;No;NicholasCage.png
1936;84;Secret Agent;Mystery;Lorre, Peter;Carroll, Madeleine;Hitchcock, Alfred;50;No;alfredHitchcock.png
1986;71;Paramount Comedy Theater: Well-Developed;Comedy;Mahler, Bruce;Carter, Judy;;40;No;NicholasCage.png
1972;71;Big Bust Out, The;Action;Kendall, Tony;Carter, Karen;Theumer, Ernst R. von;50;No;NicholasCage.png
1987;119;Fourth Protocol, The;Mystery;Caine, Michael;Cassidy, Joanna;Mackenzie, John;14;No;NicholasCage.png
1990;107;Gremlins 2: The New Batch;Comedy;Galligan, Zach;Cates, Phoebe;Dante, Joe;61;No;NicholasCage.png
1982;92;Fast Times at Ridgemont High;Comedy;Penn, Sean;Cates, Phoebe;Heckerling, Amy;65;No;NicholasCage.png
1987;;Mannequin;Comedy;McCarthy, Andrew;Cattrall, Kim;Gottlieb, Michael;23;No;NicholasCage.png
1977;91;Rabid;Horror;Moore, Frank;Chambers, Marilyn;Cronenberg, David;34;No;NicholasCage.png
1990;;Party, The;Comedy;Sellers, Peter;Champion, Marge;Edwards, Blake;32;No;NicholasCage.png
1989;90;Vampire Raiders, Ninja Queen;Action;Peterson, Chris;Chan, Agnes;Lambert, Bruce;15;No;NicholasCage.png
1970;26;Bloopers from Star Trek;Comedy;Lawford, Peter;Channing, Carol;;22;No;NicholasCage.png
1943;99;Destroyer;Action;Robinson, Edward G.;Chapman, Marguerite;Seiter, William A.;87;No;NicholasCage.png
1992;99;Party Girl;Comedy;Taylor, Robert;Charisse, Cyd;Ray, Nicholas;85;No;NicholasCage.png
1989;113;Twin Peaks;Mystery;MacLachlan, Kyle;Chen, Joan;Lynch, David;86;No;kyle.png
1987;103;Moonstruck;Comedy;Cage, Nicolas;Cher;Jewison, Norman;6;Yes;NicholasCage.png
1987;119;Witches of Eastwick, The;Comedy;Nicholson, Jack;Cher;Miller, George;8;No;NicholasCage.png
1979;128;Moonraker;Action;Moore, Roger;Chiles, Lois;Gilbert, Lewis;32;No;NicholasCage.png
1984;106;Beat Street;Drama;Davis, Guy;Chong, Rae Dawn;Lathan, Stan;72;No;NicholasCage.png
1986;88;Running Out of Luck;Comedy;Jagger, Mick;Chong, Rae Dawn;;16;No;NicholasCage.png
1989;90;Never on Tuesday;Drama;Lauer, Andrew;Christian, Claudia;Rifkin, Adam;77;No;NicholasCage.png
1975;109;Shampoo;Comedy;Beatty, Warren;Christie, Julie;Ashby, Hal;69;Yes;NicholasCage.png
1985;111;Power;Drama;Hackman, Gene;Christie, Julie;Lumet, Sidney;43;No;NicholasCage.png
1965;122;Darling;Drama;Harvey, Laurence;Christie, Julie;Schlesinger, John;44;Yes;NicholasCage.png
1963;120;Ugly American, The;Drama;Brando, Marlon;Church, Sandra;Englund, George;63;No;brando.png
1931;68;Ambassador Bill;Comedy;Rogers, Will;Churchill, Marguerite;Taylor, Sam;66;No;NicholasCage.png
1931;110;Big Trail, The;Western;Wayne, John;Churchill, Marguerite;Walsh, Raoul;22;No;johnWayne.png
1967;111;Hombre;Western;Newman, Paul;Cilento, Diane;Ritt, Martin;50;No;paulNewman.png
1968;103;Coogan's Bluff;Action;Eastwood, Clint;Clark, Susan;Siegel, Don;57;No;clintEastwood.png
1989;91;Penn & Teller Get Killed;Comedy;Penn, Jillette;Clarke, Caitlin;Penn, Arthur;12;No;NicholasCage.png
1987;118;Shy People;Drama;Philbin, John;Clayburgh, Jill;Konchalovsky, Andrei;7;No;NicholasCage.png
1980;91;It's My Turn;Comedy;Douglas, Michael;Clayburgh, Jill;Weill, Claudia;0;No;NicholasCage.png
1988;119;Dangerous Liaisons;Drama;Malkovich, John;Close, Glenn;Frears, Stephen;77;No;MichellePfeiffer.png
1990;111;Reversal of Fortune;Drama;Irons, Jeremy;Close, Glenn;Schroeder, Barbet;73;Yes;NicholasCage.png
1991;119;Meeting Venus;Comedy;Arestrup, Niels;Close, Glenn;Szabó, István;74;No;NicholasCage.png
1946;105;Tomorrow Is Forever;Drama;Welles, Orson;Colbert, Claudette;;65;No;NicholasCage.png
1987;101;Like Father Like Son;Comedy;Cameron, Kirk;Colin, Margaret;Daniel, Rod;20;No;NicholasCage.png
1948;81;Rope;Drama;Stewart, James;Collier, Constance;Hitchcock, Alfred;39;No;alfredHitchcock.png
1962;91;Road to Hong Kong;Comedy;Hope, Bob;Collins, Joan;Panama, Norman;37;No;NicholasCage.png
1989;108;Shirley Valentine;Comedy;Conti, Tom;Collins, Pauline;Gilbert, Lewis;51;No;NicholasCage.png
1992;135;City of Joy;Drama;Swayze, Patrick;Collins, Pauline;Joffe, Roland;87;No;NicholasCage.png
1966;99;Appaloosa, The;Western;Brando, Marlon;Comer, Anjanette;Furie, Sidney J.;15;No;brando.png
1986;88;Seven Minutes in Heaven;Comedy;Thames, Byron;Connelly, Jennifer;Feferman, Linda;49;No;NicholasCage.png
1991;96;Hearts of Darkness, A Filmmaker's Apocalypse;Drama;Bottoms, Sam;Coppola, Eleanor;Bahr, Fax;72;No;NicholasCage.png
1961;66;Tonight for Sure;Comedy;Lee, Karla;Cornell, Laura;Coppola, Francis Ford;4;No;NicholasCage.png
1990;110;White Hunter, Black Heart;Adventure;Eastwood, Clint;Cornwell, Charlotte;Eastwood, Clint;66;No;clintEastwood.png
1962;110;Sundays & Cybele;Drama;Kruger, Hardy;Courcel, Nicole;Bourguignon, Serge;11;Yes;NicholasCage.png
1989;90;Puppet Master;Science Fiction;LeMat, Paul;Crampton, Barbara;Schmoeller, David;20;No;NicholasCage.png
1991;95;Night Gallery;Horror;McDowall, Roddy;Crawford, Joan;Spielberg, Steven;31;No;NicholasCage.png
1989;103;Pet Sematary;Horror;Gwynne, Fred;Crosby, Denise;Lambert, Mary;27;No;NicholasCage.png
1992;60;America's Music, Gospel;Music;Phipps, Wentley;Crouch, Sandra;Walton, Kip;13;No;NicholasCage.png
1977;123;Slap Shot;Comedy;Newman, Paul;Crouse, Lindsay;Hill, George Roy;82;No;paulNewman.png
1987;109;O. C. & Stiggs;Comedy;Jenkins, Daniel H.;Curtin, Jane;Altman, Robert;3;No;NicholasCage.png
1988;108;A Fish Called Wanda;Comedy;Cleese, John;Curtis, Jamie Lee;Crichton, Charles;7;Yes;NicholasCage.png
1954;96;A Lesson in Love;Comedy;Björnstrand, Gunnar;Dahlbeck, Eva;Bergman, Ingmar;48;No;Bergman.png
1957;82;Brink of Life;Drama;Josephson, Erland;Dahlbeck, Eva;Bergman, Ingmar;57;No;Bergman.png
1986;120;Betty Blue;Drama;Anglade, Jean-Hughes;Dalle, Béatrice;Beineix, Jean-Jacques;71;No;NicholasCage.png
1979;122;Hair;Music;Savage, John;D'Angelo, Beverly;Forman, Milos;67;No;NicholasCage.png
1989;97;National Lampoon's Christmas Vacation;Comedy;Chase, Chevy;D'Angelo, Beverly;S, Jeremiah;81;No;NicholasCage.png
1974;124;Dersu Uzala, (The Hunter);Adventure;Solomin, Yuri;Danilchenko, Svetlana;Kurosawa, Akira;81;Yes;NicholasCage.png
1990;106;Alice;Comedy;Baldwin, Alec;Danner, Blythe;Allen, Woody;22;No;woody.png
1980;90;Fifth Floor, The;Mystery;Hopkins, Bo;D'Arbanville, Patti;Avedis, Howard Hikmet;74;No;NicholasCage.png
1990;94;Snow Kill;Drama;Knox, Terence;D'Arbanville, Patti;Wright, Thomas J.;35;No;NicholasCage.png
1971;74;People, The;Drama;Shatner, William;Darby, Kim;Coppola, Francis Ford;36;No;NicholasCage.png
1969;128;True Grit;Western;Wayne, John;Darby, Kim;Hathaway, Henry;77;Yes;johnWayne.png
1942;18;Battle of Midway, The;War;Crisp, Donald;Darwell, Jane;Ford, John;75;No;johnFord.png
1948;103;Three Godfathers;Western;Wayne, John;Darwell, Jane;Ford, John;72;No;johnWayne.png
1965;133;Hush, Hush, Sweet Charlotte;Mystery;Cotten, Joseph;Davis, Bette;Aldrich, Robert;68;No;NicholasCage.png
1946;110;A Stolen Life;Drama;Ford, Glenn;Davis, Bette;Bernhardt, Curtis;20;No;glennFord.png
1939;96;Old Maid, The;Drama;Brent, George;Davis, Bette;Goulding, Edmund;18;No;NicholasCage.png
1950;138;All about Eve;Drama;Sanders, George;Davis, Bette;Mankiewicz, Joseph L.;23;Yes;NicholasCage.png
1986;96;Fly, The;Horror;Goldblum, Jeff;Davis, Geena;Cronenberg, David;33;No;NicholasCage.png
1990;89;Quick Change;Comedy;Murray, Bill;Davis, Geena;Franklin, Howard;24;No;NicholasCage.png
1988;93;Lair of the White Worm, The;Horror;Grant, Hugh;Davis, Sammi;Russell, Ken;16;No;NicholasCage.png
1989;104;Rainbow, The;Drama;Hemmings, David;Davis, Sammi;Russell, Ken;53;No;NicholasCage.png
1956;120;Man Who Knew Too Much, The;Mystery;Stewart, James;Day, Doris;Hitchcock, Alfred;15;No;alfredHitchcock.png
1992;90;Beauty & the Beast;Science Fiction;Marais, Jean;Day, Josette;Cocteau, Jean;14;No;NicholasCage.png
1940;120;Foreign Correspondent;Mystery;McCrea, Joel;Day, Laraine;Hitchcock, Alfred;61;No;alfredHitchcock.png
1949;115;Heiress, The;Drama;Richardson, Ralph;De Havilland, Olivia;Wyler, William;81;Yes;NicholasCage.png
1986;120;Boy Who Could Fly, The;Drama;Underwood, Jay;Deakins, Lucy;Castle, Nick;25;No;NicholasCage.png
1975;89;Terrorists, The;Action;Connery, Sean;Dean, Isabel;Wrede, Caspar;4;No;seanConnery.png
1942;85;Wheel of Fortune;Drama;Wayne, John;Dee, Frances;Auer, John H.;36;No;johnWayne.png
1989;120;Do the Right Thing;Drama;Aiello, Danny;Dee, Ruby;Lee, Spike;5;No;NicholasCage.png
1990;93;Court-Martial of Jackie Robinson, The;Drama;Braugher, Andre;Dee, Ruby;Peerce, Larry;33;No;NicholasCage.png
1967;90;Elvira Madigan;Drama;Berggren, Thommy;Degermark, Pia;Widerberg, Bo;28;No;NicholasCage.png
1992;86;Hurricane Smith;Action;Weathers, Carl;Delaney, Cassandra;Budds, Colin;16;No;NicholasCage.png
1987;86;Fair Game;Action;Ford, Peter;Delaney, Cassandra;;24;No;NicholasCage.png
1989;95;Rape of the Sabines, The;Action;Moore, Roger;Demongeot, Mylene;;83;No;NicholasCage.png
1983;99;Risky Business;Comedy;Cruise, Tom;DeMornay, Rebecca;Brickman, Paul;28;No;NicholasCage.png
1980;103;I Love All of You (Je Vous Aime);Drama;Depardieu, Gérard;Deneuve, Catherine;Berri, Claude;40;No;NicholasCage.png
1986;108;Love Songs;Drama;Lambert, Christopher;Deneuve, Catherine;Chouraqui, Elie;15;No;NicholasCage.png
1983;114;Le Choix des Armes;Mystery;Montand, Yves;Deneuve, Catherine;Corneau, Alain;15;No;NicholasCage.png
1981;135;Choice of Arms;Action;Montand, Yves;Deneuve, Catherine;Corneau, Alain;87;No;NicholasCage.png
1977;107;March or Die;War;Hackman, Gene;Deneuve, Catherine;Richards, Dick;59;No;NicholasCage.png
1980;135;Last Metro, The;Drama;Depardieu, Gérard;Deneuve, Catherine;Truffaut, François;66;No;NicholasCage.png
1986;120;Jean de Florette;Drama;Montand, Yves;Depardieu, Elizabeth;Berri, Claude;87;Yes;NicholasCage.png
1989;127;Fat Man & Little Boy;Drama;Newman, Paul;Dern, Laura;Joffe, Roland;86;No;paulNewman.png
1990;125;Wild at Heart;Drama;Cage, Nicolas;Dern, Laura;Lynch, David;6;No;NicholasCage.png
1989;113;Family Business;Action;Connery, Sean;DeSoto, Rosana;Lumet, Sidney;5;No;seanConnery.png
1988;103;Stand & Deliver;Drama;Olmos, Edward James;DeSoto, Rosana;Menendez, Ramon;19;No;NicholasCage.png
1981;94;Looker;Science Fiction;Finney, Albert;Dey, Susan;Crichton, Michael;62;No;NicholasCage.png
1989;89;Fire & Rain;Action;Haid, Charles;Dickinson, Angie;Jameson, Jerry;10;No;NicholasCage.png
1990;56;Best of Candid Camera, The;Comedy;Allen, Woody;Dickinson, Angie;;12;No;woody.png
1940;83;Seven Sinners;Drama;Wayne, John;Dietrich, Marlene;Garnett, Tay;24;No;johnWayne.png
1961;190;Judgment at Nuremberg;Drama;Tracy, Spencer;Dietrich, Marlene;Kramer, Stanley;39;Yes;spencerTracy.png
1989;60;Minsky's Follies;Comedy;Taylor, Rip;Diller, Phyllis;;12;No;NicholasCage.png
1990;97;Novice, The;Comedy;Sharif, Omar;Dombasle, Arielle;;72;No;NicholasCage.png
1987;130;Wings of Desire;Drama;Ganz, Bruno;Dommartin, Solveig;Wenders, Wim;71;No;NicholasCage.png
1991;158;Until the End of the World;Drama;Hurt, William;Dommartin, Solveig;Wenders, Wim;57;No;NicholasCage.png
1987;118;Castaway;Drama;Reed, Oliver;Donohoe, Amanda;Roeg, Nicolas;41;No;NicholasCage.png
1993;30;Alfred Hitchcock Presents, Sorcerer's Apprentice;Mystery;Hitchcock, Alfred;Dors, Diana;;60;No;NicholasCage.png
1991;99;Delicatessen;Comedy;Benezech, Pascal;Dougnac, Marie-Laure;Caro, Marc;78;No;NicholasCage.png
1979;110;Great Train Robbery, The;Mystery;Connery, Sean;Down, Lesley-Anne;Crichton, Michael;7;No;seanConnery.png
1991;110;Hanover Street;Drama;Ford, Harrison;Down, Lesley-Anne;Hyams, Peter;81;No;NicholasCage.png
1991;102;Hunchback;Drama;Hopkins, Anthony;Down, Lesley-Anne;Tuchner, Michael;33;No;AnthonyHopkins.png
1946;97;My Darling Clementine;Western;Fonda, Henry;Downs, Cathy;Ford, John;12;No;johnFord.png
1950;86;Wagon Master;Western;Johnson, Ben;Dru, Joanne;Ford, John;30;No;johnFord.png
1949;93;She Wore a Yellow Ribbon;Western;Wayne, John;Dru, Joanne;Ford, John;84;No;johnWayne.png
1985;90;Fantasy Man;Comedy;Hopkins, Harold;Drynan, Jeanie;Meagher, John;82;No;NicholasCage.png
1986;87;Monster in the Closet;Comedy;Grant, Donald;DuBarry, Denise;Dahlin, Bob;39;No;NicholasCage.png
1992;85;Double Edge;Drama;Eban, Abba;Dunaway, Faye;Kollek, Amos;69;No;clintEastwood.png
1976;116;Network;Comedy;Finch, Peter;Dunaway, Faye;Lumet, Sidney;48;Yes;NicholasCage.png
1974;131;Chinatown;Drama;Nicholson, Jack;Dunaway, Faye;Polanski, Roman;55;Yes;JackNicholson.png
1975;117;Three Days of the Condor;Drama;Redford, Robert;Dunaway, Faye;Pollack, Sydney;87;No;NicholasCage.png
1977;134;Voyage of the Damned;Drama;Sydow, Max von;Dunaway, Faye;Rosenberg, Stuart;34;No;NicholasCage.png
1987;97;Barfly;Drama;Rourke, Mickey;Dunaway, Faye;Schroeder, Barbet;23;No;NicholasCage.png
1990;104;Wait Until Spring, Bandini;Drama;Mantegna, Joe;Dunaway, Faye;;20;No;NicholasCage.png
1947;118;Life with Father;Comedy;Powell, William;Dunne, Irene;Curtiz, Michael;10;No;NicholasCage.png
1943;;A Guy Named Joe;Drama;Tracy, Spencer;Dunne, Irene;Fleming, Victor;42;No;spencerTracy.png
1974;117;Stavisky;Drama;Belmondo, Jean-Paul;Duperey, Anny;Resnais, Alain;1;No;NicholasCage.png
1981;117;Time Bandits;Comedy;Cleese, John;Duvall, Shelley;Gilliam, Terry;5;No;NicholasCage.png
1980;144;Shining, The;Horror;Nicholson, Jack;Duvall, Shelley;Kubrick, Stanley;32;No;JackNicholson.png
1945;91;Flame of Barbary Coast;Western;Wayne, John;Dvorak, Ann;Kane, Joseph;54;No;johnWayne.png
1993;92;Naked Truth, The;Comedy;Sellers, Peter;Eaton, Shirley;;34;No;NicholasCage.png
1979;92;Brood, The;Horror;Reed, Oliver;Eggar, Samantha;Cronenberg, David;51;No;NicholasCage.png
1970;123;Molly Maguires, The;Action;Connery, Sean;Eggar, Samantha;Ritt, Martin;3;No;seanConnery.png
1984;105;Beverly Hills Cop;Comedy;Murphy, Eddie;Eilbacher, Lisa;Brest, Martin;41;No;NicholasCage.png
1991;86;Blind Man's Bluff;Mystery;Urich, Robert;Eilbacher, Lisa;Quinn, James;64;No;NicholasCage.png
1961;140;La Dolce Vita;Drama;Mastroianni, Marcello;Ekberg, Anita;Fellini, Federico;20;No;NicholasCage.png
1966;103;After the Fox;Comedy;Sellers, Peter;Ekland, Britt;De Sica, Vittorio;60;No;NicholasCage.png
1974;127;Man with the Golden Gun, The;Action;Moore, Roger;Ekland, Britt;Hamilton, Guy;41;No;NicholasCage.png
1985;96;Marbella;Action;Taylor, Rod;Ekland, Britt;Hermoso, Miguel;45;No;NicholasCage.png
1967;103;Bobo, The;Comedy;Sellers, Peter;Ekland, Britt;Parrish, Robert;80;No;NicholasCage.png
1993;53;Big Bands, The;Music;Beneke, Tex;Elgart, Les;;48;No;NicholasCage.png
1992;97;Killer Image;Mystery;Ironside, Michael;Errickson, Krista;Winning, David;8;No;NicholasCage.png
1987;94;Kandyland;Drama;Laulette, Charles;Evenson, Kim;Schnitzer, Robert Allen;41;No;NicholasCage.png
1987;94;Campus Man;Drama;Dye, John;Fairchild, Morgan;Casden, Ron;38;No;NicholasCage.png
1956;101;Jubal;Drama;Ford, Glenn;Farr, Felicia;Daves, Delmer;32;No;glennFord.png
1985;84;Purple Rose of Cairo, The;Comedy;Aiello, Danny;Farrow, Mia;Allen, Woody;20;Yes;woody.png
1984;85;Broadway Danny Rose;Comedy;Allen, Woody;Farrow, Mia;Allen, Woody;14;No;woody.png
1992;108;Husbands & Wives;Comedy;Allen, Woody;Farrow, Mia;Allen, Woody;80;No;woody.png
1986;103;Hannah & Her Sisters;Comedy;Caine, Michael;Farrow, Mia;Allen, Woody;8;Yes;woody.png
1979;115;Hurricane;Action;Robards, Jason;Farrow, Mia;Troell, Jan;8;No;NicholasCage.png
1986;95;Between Two Women;Drama;Nouri, Michael;Fawcett, Farrah;Avnet, John;52;No;NicholasCage.png
1981;96;Cannonball Run, The;Comedy;Reynolds, Burt;Fawcett, Farrah;Needham, Hal;80;No;NicholasCage.png
1936;70;Doughnuts & Society;Comedy;Nugent, Eddie;Fazenda, Louise;Collins, Lewis D.;28;No;NicholasCage.png
1978;450;Holocaust;Drama;Bottoms, Joseph;Feldshuh, Tovah;Chomsky, Marvin J.;1;No;NicholasCage.png
1990;103;Meridian;Science Fiction;Jamieson, Malcolm;Fenn, Sherilyn;Band, Charles;47;No;NicholasCage.png
1992;90;Diary of a Hitman;Drama;Whitaker, Forest;Fenn, Sherilyn;London, Roy;67;No;NicholasCage.png
1988;95;Gor;Action;Reed, Oliver;Ferratti, Rebecca;Kiersch, Fritz;2;No;NicholasCage.png
1987;95;Surrender;Comedy;Caine, Michael;Field, Sally;Belson, Jerry;84;No;NicholasCage.png
1984;112;Places in the Heart;Drama;Harris, Ed;Field, Sally;Benton, Robert;83;Yes;NicholasCage.png
1991;106;Not Without My Daughter;Drama;Molina, Alfred;Field, Sally;Gilbert, Brian;55;No;NicholasCage.png
1977;113;Heroes;Drama;Winkler, Henry;Field, Sally;Kagan, Jeremy Paul;17;No;NicholasCage.png
1981;116;Absence of Malice;Drama;Newman, Paul;Field, Sally;Pollack, Sydney;76;No;paulNewman.png
1979;110;Norma Rae;Drama;Bridges, Beau;Field, Sally;Ritt, Martin;64;Yes;NicholasCage.png
1989;118;Steel Magnolias;Drama;Skerritt, Tom;Field, Sally;Ross, Herbert;66;No;NicholasCage.png
1989;101;Burbs, The;Comedy;Hanks, Tom;Fisher, Carrie;Dante, Joe;42;No;NicholasCage.png
1980;124;Empire Strikes Back, The;Science Fiction;Hamill, Mark;Fisher, Carrie;Kershner, Irvin;33;No;NicholasCage.png
1977;121;Star Wars;Science Fiction;Hamill, Mark;Fisher, Carrie;Lucas, George;44;No;NicholasCage.png
1983;132;Return of the Jedi;Science Fiction;Hamill, Mark;Fisher, Carrie;Marquand, Richard;4;No;NicholasCage.png
1991;104;Hear My Song;Drama;Dunbar, Adrian;Fitzgerald, Tara;Chelsom, Peter;72;No;NicholasCage.png
1956;99;Slightly Scarlet;Action;Payne, John;Fleming, Rhonda;Dwan, Allan;52;No;NicholasCage.png
1957;120;Gunfight at the OK Corral;Western;Lancaster, Burt;Fleming, Rhonda;Sturges, John;84;No;burtLancaster.png
1931;;Range Feud, The;Western;Wayne, John;Fleming, Susan;Lederman, Ross;51;No;johnWayne.png
1990;89;Bloodsucking Pharaohs in Pittsburgh;Comedy;Dengel, Jake;Fletcher, Suzanne;Smithey, Alan;79;No;NicholasCage.png
1972;129;Roma;Drama;Gonzales, Peter;Florence, Fiona;Fellini, Federico;75;No;NicholasCage.png
1979;122;China Syndrome, The;Drama;Douglas, Michael;Fonda, Jane;Bridges, James;43;No;NicholasCage.png
1986;100;Morning After, The;Mystery;Bridges, Jeff;Fonda, Jane;Lumet, Sidney;6;No;NicholasCage.png
1971;114;Klute;Drama;Sutherland, Donald;Fonda, Jane;Pakula, Alan J.;15;Yes;NicholasCage.png
1979;113;Electric Horseman, The;Comedy;Redford, Robert;Fonda, Jane;Pollack, Sydney;34;No;NicholasCage.png
1965;97;Cat Ballou;Comedy;Marvin, Lee;Fonda, Jane;Silverstein, Elliot;62;Yes;NicholasCage.png
1991;;Coming Home;Drama;Voight, Jon;Fonda, Jane;;1;Yes;NicholasCage.png
1940;130;Rebecca;Drama;Olivier, Laurence;Fontaine, Joan;Hitchcock, Alfred;78;Yes;alfredHitchcock.png
1944;96;Jane Eyre;Drama;Welles, Orson;Fontaine, Joan;Stevenson, Robert;44;No;NicholasCage.png
1973;87;Stacey!;Action;Randall, Anne;Ford, Anitra;Sidaris, Andy;31;No;NicholasCage.png
1992;85;Naked Obsession;Mystery;Katt, William;Ford, Maria;Golden, Dan;26;No;NicholasCage.png
1989;83;Stripped to Kill II, Live Girls;Mystery;Lottimer, Ed;Ford, Maria;Ruben, Katt Shea;80;No;NicholasCage.png
1990;94;Rain Killer, The;Mystery;Sharkey, Ray;Ford, Maria;Stein, Ken;10;No;NicholasCage.png
1983;95;Valley Girl;Comedy;Cage, Nicolas;Foreman, Deborah;Coolidge, Martha;30;No;NicholasCage.png
1991;118;Silence of the Lambs, The;Mystery;Hopkins, Anthony;Foster, Jodie;Demme, Jonathan;8;Yes;AnthonyHopkins.png
1988;98;Stealing Home;Drama;Harmon, Mark;Foster, Jodie;Kampmann, Steven;76;No;NicholasCage.png
1972;92;Napoleon & Samantha;Comedy;Douglas, Michael;Foster, Jodie;McEveety, Bernard;33;No;NicholasCage.png
1988;;Five Corners;Drama;Robbins, Tim;Foster, Jodie;;88;No;NicholasCage.png
1955;;Blackboard Jungle, The;Drama;Ford, Glenn;Francis, Anne;Brooks, Richard;66;No;glennFord.png
1989;103;My Left Foot;Drama;Day-Lewis, Daniel;Fricker, Brenda;Sheridan, Jim;32;Yes;NicholasCage.png
1987;92;Back to the Beach;Comedy;Avalon, Frankie;Funicello, Annette;Hobbs, Lyndall;45;No;NicholasCage.png
1934;85;Painted Veil, The;Drama;Marshall, Herbert;Garbo, Greta;Boleslawski, Richard;57;No;gretaGarbo.png
1931;74;Inspiration;Drama;Apfel, Oscar;Garbo, Greta;Brown, Clarence;66;No;gretaGarbo.png
1930;92;Anna Christie;Drama;Bickford, Charles;Garbo, Greta;Brown, Clarence;0;No;gretaGarbo.png
1926;109;Flesh & the Devil, The;Drama;Gilbert, John;Garbo, Greta;Brown, Clarence;72;No;gretaGarbo.png
1928;90;Woman of Affairs;Drama;Gilbert, John;Garbo, Greta;Brown, Clarence;83;No;gretaGarbo.png
1935;96;Anna Karenina;Drama;March, Fredric;Garbo, Greta;Brown, Clarence;35;Yes;gretaGarbo.png
1936;110;Camille;Drama;Taylor, Robert;Garbo, Greta;Cukor, George;74;No;gretaGarbo.png
1931;91;Mata Hari;Drama;Novarro, Ramon;Garbo, Greta;Fitzmaurice, George;67;No;gretaGarbo.png
1929;100;Wild Orchids;Drama;Stone, Lewis;Garbo, Greta;Franklin, Sidney;70;No;gretaGarbo.png
1932;112;Grand Hotel;Drama;Barrymore, John;Garbo, Greta;Goulding, Edmund;81;Yes;gretaGarbo.png
1931;84;Susan Lennox, Her Fall & Rise;Drama;Hale, Alan;Garbo, Greta;Leonard, Robert Z.;64;No;gretaGarbo.png
1939;108;Ninotchka;Comedy;Douglas, Melvyn;Garbo, Greta;Lubitsch, Ernst;40;No;gretaGarbo.png
1933;97;Queen Christina;Drama;Gilbert, John;Garbo, Greta;Mamoulian, Rouben;82;No;gretaGarbo.png
1928;96;Mysterious Lady, The;Drama;Nagel, Conrad;Garbo, Greta;Niblo, Fred;72;No;gretaGarbo.png
1925;125;Joyless Street;Drama;Stuart, Henry;Garbo, Greta;Pabst, Georg Wilhelm;73;No;gretaGarbo.png
1929;74;Single Standard, The;Drama;Asther, Nils;Garbo, Greta;Robertson, John S.;73;No;gretaGarbo.png
1932;71;As You Desire Me;Drama;Douglas, Melvyn;Garbo, Greta;;85;No;gretaGarbo.png
1930;76;Romance;Drama;Stone, Lewis;Garbo, Greta;;62;No;gretaGarbo.png
1962;105;A Child Is Waiting;Drama;Lancaster, Burt;Garland, Judy;Cassavetes, John;60;No;burtLancaster.png
1982;116;Tootsie;Comedy;Hoffman, Dustin;Garr, Teri;Pollack, Sydney;8;Yes;NicholasCage.png
1989;86;Let It Ride;Comedy;Dreyfuss, Richard;Garr, Teri;Pytka, Joe;88;No;NicholasCage.png
1953;120;Julius Caesar;Drama;Brando, Marlon;Garson, Greer;Mankiewicz, Joseph L.;50;No;brando.png
1979;120;Nineteen Forty-One;Comedy;Belushi, John;Gary, Lorraine;Spielberg, Steven;24;No;NicholasCage.png
1975;124;Jaws;Action;Scheider, Roy;Gary, Lorraine;Spielberg, Steven;6;No;NicholasCage.png
1987;93;Hot Pursuit;Drama;Cusack, John;Gazelle, Wendy;Lisberger, Steven;44;No;NicholasCage.png
1989;120;Triumph of the Spirit;Drama;Dafoe, Willem;Gazelle, Wendy;Young, Robert M.;49;No;NicholasCage.png
1975;111;Brannigan;Drama;Wayne, John;Geeson, Judy;Hickox, Douglas;64;No;johnWayne.png
1979;89;Buffet Froid;Comedy;Depardieu, Gérard;Gence, Denise;Blier, Bertrand;75;No;NicholasCage.png
1986;122;Salvador;Drama;Woods, James;Gibb, Cynthia;Stone, Oliver;77;No;NicholasCage.png
1959;102;Horse Soldiers, The;Western;Wayne, John;Gibson, Althea;Ford, John;76;No;johnWayne.png
1954;108;Long John Silver;Action;Newton, Robert;Gilchrist, Connie;Haskin, Byron;56;No;NicholasCage.png
1961;134;Hustler, The;Drama;Newman, Paul;Gleason, Jackie;Rossen, Robert;43;Yes;paulNewman.png
1983;109;Star Chamber, The;Drama;Douglas, Michael;Gless, Sharon;Hyams, Peter;3;No;NicholasCage.png
1988;100;Clara's Heart;Drama;Ontkean, Michael;Goldberg, Whoopi;Mulligan, Robert;60;No;NicholasCage.png
1987;102;Burglar;Comedy;Goldthwait, Bob;Goldberg, Whoopi;Wilson, Hugh;44;No;NicholasCage.png
1986;120;Comic Relief;Comedy;Crystal, Billy;Goldberg, Whoopi;;69;No;NicholasCage.png
1978;117;Bloodbrothers;Drama;Sorvino, Paul;Goldoni, Lelia;Mulligan, Robert;11;No;NicholasCage.png
1988;134;Rain Man;Drama;Hoffman, Dustin;Golino, Valeria;Levinson, Barry;8;Yes;NicholasCage.png
1966;95;Masculine Feminine;Drama;Leaud, Jean-Pierre;Goya, Chantal;Godard, Jean-Luc;20;No;NicholasCage.png
1964;51;Outer Limits, The;Science Fiction;Perrin, Vic;Grahame, Gloria;Stanley, Paul;27;No;NicholasCage.png
1988;;Mama's Dirty Girls;Horror;Currie, Sondra;Grahame, Gloria;;62;No;NicholasCage.png
1979;180;Last Ride of the Dalton Gang, The;Western;Palance, Jack;Greenbush, Lindsay;Curtis, Dan;62;No;NicholasCage.png
1991;;Why Me?;Comedy;Lambert, Christopher;Greist, Kim;;74;No;NicholasCage.png
1932;66;Number Seventeen;Crime;Lion, Leon M.;Grey, Anne;Hitchcock, Alfred;66;No;alfredHitchcock.png
1986;120;Manhunter;Drama;Petersen, William L.;Greist, Kim;Mann, Michael;19;No;NicholasCage.png
1990;126;Bonfire of the Vanities, The;Drama;Hanks, Tom;Griffith, Melanie;De Palma, Brian;82;No;NicholasCage.png
1988;115;Working Girl;Comedy;Ford, Harrison;Griffith, Melanie;Nichols, Mike;25;No;NicholasCage.png
1992;133;Shining Through;Mystery;Douglas, Michael;Griffith, Melanie;Seltzer, David;11;No;NicholasCage.png
1991;76;Slumber Party Massacre III;Horror;Christian, Keely;Grye, Brittain;;40;No;NicholasCage.png
1988;99;Tokyo Pop;Comedy;Tadokoro, Yutaka;Hamilton, Carrie;Kuzui, Fran Rubel;2;No;NicholasCage.png
1991;136;Terminator 2;Action;Schwarzenegger, Arnold;Hamilton, Linda;Cameron, James;8;No;T2.png
1984;108;Terminator, The;Action;Schwarzenegger, Arnold;Hamilton, Linda;Cameron, James;17;No;T2.png
1986;105;King Kong Lives!;Action;Kerwin, Brian;Hamilton, Linda;Guillermin, John;20;No;NicholasCage.png
1969;125;Those Daring Young Men in Their Jaunty;Comedy;Curtis, Tony;Hampshire, Susan;;59;No;NicholasCage.png
1991;186;At Play in the Fields of the Lord;Drama;Berenger, Tom;Hannah, Daryl;Babenco, Hector;81;No;NicholasCage.png
1990;;Crazy People;Comedy;Moore, Dudley;Hannah, Daryl;Bill, Tony;61;No;NicholasCage.png
1992;99;Memoirs of an Invisible Man;Comedy;Chase, Chevy;Hannah, Daryl;Carpenter, John;58;No;NicholasCage.png
1985;100;Clan of the Cave Bear, The;Drama;Remar, James;Hannah, Daryl;Chapman, Michael;73;No;NicholasCage.png
1983;82;Final Terror, The;Horror;Zmed, Adrian;Hannah, Daryl;Davis, Andrew;24;No;NicholasCage.png
1984;93;Reckless;Drama;Quinn, Aidan;Hannah, Daryl;Foley, James;14;No;NicholasCage.png
1989;;High Spirits;Comedy;O'Toole, Peter;Hannah, Daryl;Jordan, Neil;53;No;NicholasCage.png
1987;107;Roxanne;Comedy;Martin, Steve;Hannah, Daryl;Schepisi, Fred;66;No;NicholasCage.png
1982;117;Blade Runner;Action;Ford, Harrison;Hannah, Daryl;Scott, Ridley;1;No;NicholasCage.png
1987;126;Wall Street;Drama;Douglas, Michael;Hannah, Daryl;Stone, Oliver;6;Yes;NicholasCage.png
1992;111;Pope of Greenwich Village;Drama;Rourke, Mickey;Hannah, Daryl;;58;No;NicholasCage.png
1989;89;After School;Drama;Bottoms, Sam;Hannah, Page;;59;No;NicholasCage.png
1938;298;Flaming Frontiers;Western;Brown, Johnny Mack;Hansen, Eleanor;Taylor, Ray;82;No;NicholasCage.png
1936;89;Libeled Lady;Comedy;Powell, William;Harlow, Jean;Conway, Jack;86;No;NicholasCage.png
1976;99;Inserts;Drama;Dreyfuss, Richard;Harper, Jessica;Byrum, John;85;No;NicholasCage.png
1988;88;Blue Iguana, The;Drama;McDermott, Dylan;Harper, Jessica;Lafia, John;65;No;NicholasCage.png
1983;93;Tender Mercies;Drama;Duvall, Robert;Harper, Tess;Beresford, Bruce;61;Yes;NicholasCage.png
1987;96;Nights in White Satin;Drama;Gilman, Kenneth;Harris, Priscilla;Barnard, Michael;5;No;NicholasCage.png
1989;87;Videodrome;Horror;Woods, James;Harry, Deborah;Cronenberg, David;36;No;NicholasCage.png
1991;96;Intimate Stranger;Mystery;Russo, James;Harry, Deborah;Holzman, Allan;23;No;NicholasCage.png
1986;110;Highlander;Science Fiction;Lambert, Christopher;Hart, Roxanne;Mulcahy, Russell;8;No;NicholasCage.png
1987;93;Bodycount;Action;White, Bernie;Hassett, Marilyn;;51;No;NicholasCage.png
1989;104;Tango & Cash;Action;Stallone, Sylvester;Hatcher, Teri;Konchalovsky, Andrei;9;No;NicholasCage.png
1970;94;There's a Girl in My Soup;Comedy;Sellers, Peter;Hawn, Goldie;Boulting, Roy;41;No;NicholasCage.png
1984;100;Swing Shift;Drama;Russell, Kurt;Hawn, Goldie;Demme, Jonathan;81;No;NicholasCage.png
1978;112;Foul Play;Comedy;Chase, Chevy;Hawn, Goldie;Higgins, Colin;46;No;NicholasCage.png
1982;109;Best Friends;Comedy;Reynolds, Burt;Hawn, Goldie;Jewison, Norman;74;No;NicholasCage.png
1972;109;Butterflies Are Free;Drama;Albert, Edward;Hawn, Goldie;Katselas, Milton;82;Yes;NicholasCage.png
1987;112;Overboard;Comedy;Russell, Kurt;Hawn, Goldie;Marshall, Garry;6;No;NicholasCage.png
1974;103;Girl from Petrovka, The;Drama;Holbrook, Hal;Hawn, Goldie;Miller, Robert Ellis;23;No;NicholasCage.png
1992;102;Housesitter;Comedy;Martin, Steve;Hawn, Goldie;Oz, Frank;14;No;NicholasCage.png
1986;106;Wildcats;Comedy;Keach, James;Hawn, Goldie;Ritchie, Michael;22;No;NicholasCage.png
1984;100;Protocol;Comedy;Sarandon, Chris;Hawn, Goldie;Ross, Herbert;53;No;NicholasCage.png
1980;102;Seems Like Old Times;Comedy;Chase, Chevy;Hawn, Goldie;Sandrich, Jay;49;No;NicholasCage.png
1974;109;Sugarland Express, The;Drama;Johnson, Ben;Hawn, Goldie;Spielberg, Steven;28;No;NicholasCage.png
1980;110;Private Benjamin;Comedy;Assante, Armand;Hawn, Goldie;Zieff, Howard;61;No;NicholasCage.png
1991;115;Deceived;Mystery;Heard, John;Hawn, Goldie;;55;No;NicholasCage.png
1931;95;Arrowsmith;Drama;Colman, Ronald;Hayes, Helen;Ford, John;84;No;johnFord.png
1972;78;Say Goodbye Maggie Cole;Drama;McGavin, Darren;Hayward, Susan;Taylor, Jud;84;No;NicholasCage.png
1964;132;Circus World;Drama;Wayne, John;Hayworth, Rita;Hathaway, Henry;29;No;johnWayne.png
1952;98;Affair in Trinidad;Drama;Ford, Glenn;Hayworth, Rita;Sherman, Vincent;49;No;glennFord.png
1948;87;Lady from Shanghai;Mystery;Welles, Orson;Hayworth, Rita;Welles, Orson;16;No;NicholasCage.png
1940;81;Lady in Question;Drama;Aherne, Brian;Hayworth, Rita;Vidor, Charles;57;No;NicholasCage.png
1946;110;Gilda;Drama;Ford, Glenn;Hayworth, Rita;Vidor, Charles;57;No;glennFord.png
1948;98;Loves of Carmen, The;Drama;Ford, Glenn;Hayworth, Rita;Vidor, Charles;48;No;glennFord.png
1990;105;Dick Tracy;Comedy;Beatty, Warren;Headley, Glenne;Beatty, Warren;84;No;NicholasCage.png
1964;130;Marnie;Drama;Connery, Sean;Hedren, Tippi;Hitchcock, Alfred;2;No;seanConnery.png
1987;85;Hot Child in the City;Mystery;Prysirr, Geof;Hendrix, Leah Ayres;Florea, John;0;No;NicholasCage.png
1984;90;Johnny Dangerously;Comedy;Piscopo, Joe;Henner, Marilu;Heckerling, Amy;3;No;NicholasCage.png
1985;95;Stark;Mystery;Surovy, Nicolas;Henner, Marilu;Holcomb, Rod;27;No;NicholasCage.png
1949;84;Three Strange Loves;Drama;Malmsten, Birger;Henning, Eva;Bergman, Ingmar;87;No;Bergman.png
1964;170;My Fair Lady;Music;Harrison, Rex;Hepburn, Audrey;Cukor, George;10;Yes;NicholasCage.png
1960;123;Unforgiven, The;Drama;Lancaster, Burt;Hepburn, Audrey;Huston, John;32;No;burtLancaster.png
1976;106;Robin & Marian;Action;Connery, Sean;Hepburn, Audrey;Lester, Richard;6;No;seanConnery.png
1961;109;Children's Hour, The;Drama;Garner, James;Hepburn, Audrey;Wyler, William;60;No;NicholasCage.png
1956;121;Rainmaker, The;Drama;Lancaster, Burt;Hepburn, Katharine;Anthony, Joseph;21;No;katharineHepburn.png
1952;95;Pat & Mike;Comedy;Tracy, Spencer;Hepburn, Katharine;Cukor, George;48;No;spencerTracy.png
1968;134;Lion in Winter, The;Drama;O'Toole, Peter;Hepburn, Katharine;Harvey, Anthony;78;Yes;katharineHepburn.png
1991;132;Sea of Grass, The;Western;Tracy, Spencer;Hepburn, Katharine;Kazan, Elia;75;No;spencerTracy.png
1967;108;Guess Who's Coming to Dinner;Drama;Tracy, Spencer;Hepburn, Katharine;Kramer, Stanley;50;Yes;spencerTracy.png
1957;153;Desk Set;Comedy;Tracy, Spencer;Hepburn, Katharine;Lang, Walter;51;No;spencerTracy.png
1975;107;Rooster Cogburn;Western;Wayne, John;Hepburn, Katharine;Miller, Stuart;76;No;johnWayne.png
1981;109;On Golden Pond;Drama;Fonda, Henry;Hepburn, Katharine;Rydell, Mark;23;Yes;katharineHepburn.png
1991;101;Adam's Rib;Comedy;Tracy, Spencer;Hepburn, Katharine;;62;No;spencerTracy.png
1991;116;Boom Town;Drama;Tracy, Spencer;Hepburn, Katharine;;73;No;katharineHepburn.png
1991;145;Dragon Seed;Drama;Tracy, Spencer;Hepburn, Katharine;;34;No;katharineHepburn.png
1991;115;Little Women;Drama;Tracy, Spencer;Hepburn, Katharine;;22;No;katharineHepburn.png
1991;113;Philadelphia Story, The;Comedy;Tracy, Spencer;Hepburn, Katharine;;25;No;katharineHepburn.png
1991;112;Without Love;Comedy;Tracy, Spencer;Hepburn, Katharine;;66;No;katharineHepburn.png
1991;113;Woman of the Year;Comedy;Tracy, Spencer;Hepburn, Katharine;;12;No;spencerTracy.png
1992;95;Juice;Drama;Shakur, Tupac;Herron, Cindy;Dickerson, Ernest R.;31;No;NicholasCage.png
1986;114;Hoosiers;Drama;Hackman, Gene;Hershey, Barbara;Anspaugh, David;2;No;NicholasCage.png
1987;112;Tin Men;Comedy;Dreyfuss, Richard;Hershey, Barbara;Levinson, Barry;50;No;NicholasCage.png
1988;163;Last Temptation of Christ, The;Drama;Dafoe, Willem;Hershey, Barbara;Scorsese, Martin;32;No;NicholasCage.png
1991;99;Paris Trout;Drama;Hopper, Dennis;Hershey, Barbara;;53;No;NicholasCage.png
1988;87;Souvenir;Drama;Plummer, Christopher;Hicks, Catherine;Reeve, Geoffrey;42;No;NicholasCage.png
1966;120;A Man for All Seasons;Drama;Shaw, Robert;Hiller, Wendy;Zinnemann, Fred;20;Yes;NicholasCage.png
1986;90;Knights & Emeralds;Drama;Leadbitter, Bill;Hills, Beverly;Emes, Ian;;No;NicholasCage.png
1989;83;Masque of the Red Death;Horror;MacNee, Patrick;Hoak, Clare;Brand, Larry;9;No;NicholasCage.png
1943;265;Adventures of Smilin' Jack, The;Mystery;Brown, Tom;Hobart, Rose;Taylor, Ray;77;No;NicholasCage.png
1992;88;Adventures in Dinosaur City;Action;Katz, Omri;Hoffman, Shawn;Thompson, Brett;19;No;NicholasCage.png
1987;95;Allnighter, The;Comedy;Terlesky, John;Hoffs, Susanna;Hoffs, Tamar Simon;71;No;NicholasCage.png
1980;99;Caddyshack;Comedy;Chase, Chevy;Holcomb, Sarah;Ramis, Harold;70;No;NicholasCage.png
1973;102;Tom Sawyer;Music;Whitaker, Johnny;Holm, Celeste;Taylor, Don;11;No;NicholasCage.png
1987;94;Rita, Sue & Bob Too;Comedy;Finneran, Siobhan;Holmes, Michelle;Clarke, Alan;5;No;NicholasCage.png
1947;56;Hawk of Powder River;Western;Dean, Eddie;Holt, Jennifer;Taylor, Ray;61;No;NicholasCage.png
1928;148;Tempest;Drama;Barrymore, John;Horn, Camilla;Taylor, Sam;33;No;NicholasCage.png
1986;90;Running Mates;Drama;Webb, Greg;Howard, Barbara;Neff, Thomas L.;63;No;NicholasCage.png
1987;105;Prettykill;Drama;Birney, David;Hubley, Season;Kaczender, George;71;No;NicholasCage.png
1934;80;Judge Priest;Drama;Rogers, Will;Hudson, Rochelle;Ford, John;9;No;johnFord.png
1950;104;Harvey;Comedy;Stewart, James;Hull, Josephine;Koster, Henry;42;No;NicholasCage.png
1991;89;If Looks Could Kill;Action;Grieco, Richard;Hunt, Linda;Wilmington, Michael;10;No;NicholasCage.png
1987;94;Raising Arizona;Comedy;Cage, Nicolas;Hunter, Holly;Coen, Joel;23;No;NicholasCage.png
1989;114;Once Around;Comedy;Dreyfuss, Richard;Hunter, Holly;Hallström, Lasse;68;No;NicholasCage.png
1980;110;Loulou;Drama;Depardieu, Gérard;Huppert, Isabelle;Pialat, Maurice;65;No;NicholasCage.png
1982;136;World According to Garp, The;Drama;Williams, Robin;Hurt, Mary Beth;Hill, George Roy;59;No;NicholasCage.png
1980;106;Virus;Science Fiction;Kennedy, George;Hussey, Olivia;Fukasaku, Kinji;62;No;NicholasCage.png
1940;127;Northwest Passage;Action;Tracy, Spencer;Hussey, Ruth;Vidor, King;51;No;spencerTracy.png
1987;112;Gardens of Stone;Drama;Caan, James;Huston, Anjelica;Coppola, Francis Ford;27;No;NicholasCage.png
1989;121;Enemies, a Love Story;Drama;Silver, Ron;Huston, Anjelica;Mazursky, Paul;5;No;NicholasCage.png
1992;102;Addams Family, The;Comedy;Julia, Raul;Huston, Anjelica;Sonnenfeld, B.;8;No;NicholasCage.png
1932;65;Freaks;Horror;Ford, Wallace;Hyams, Leila;Browning, Tod;61;No;NicholasCage.png
1991;108;Necessary Roughness;Comedy;Bakula, Scott;Ireland, Kathy;Dragoti, Stan;60;No;NicholasCage.png
1990;93;A Show of Force;Drama;Garcia, Andy;Irving, Amy;Barreto, Bruno;1;No;NicholasCage.png
1980;129;Competition, The;Drama;Dreyfuss, Richard;Irving, Amy;Oliansky, Joel;45;No;NicholasCage.png
1988;97;Crossing Delancey;Comedy;Riegert, Peter;Irving, Amy;Silver, Joan Micklin;6;No;NicholasCage.png
1982;120;State of Things, The;Drama;Kime, Jeffrey;Weingarten, Isabelle;Wenders, Wim;73;No;NicholasCage.png
1987;89;Business As Usual;Comedy;Thaw, John;Jackson, Glenda;Barrett, Lezli-An;17;No;NicholasCage.png
1973;103;A Touch of Class;Comedy;Segal, George;Jackson, Glenda;Frank, Melvin;79;Yes;NicholasCage.png
1970;129;Women in Love;Drama;Bates, Alan;Jackson, Glenda;Russell, Ken;18;No;NicholasCage.png
1988;89;Salome's Last Dance;Comedy;Johns, Stratford;Jackson, Glenda;Russell, Ken;76;No;NicholasCage.png
1986;100;Casino;Mystery;Connors, Mike;Jackson, Sherry;Chaffey, Don;5;No;NicholasCage.png
1955;108;Smiles of a Summer Night;Comedy;Björnstrand, Gunnar;Jacobsson, Ulla;Bergman, Ingmar;58;No;Bergman.png
1989;90;New Year's Day;Comedy;Jaglom, Henry;Jakobsen, Maggie;Jaglom, Henry;88;No;NicholasCage.png
1981;132;Mephisto;Drama;Brandauer, Klaus Maria;Janda, Krystyna;Szabó, István;80;Yes;NicholasCage.png
1927;60;Easy Virtue;Mystery;Dyall, Franklin;Jeans, Isabel;Hitchcock, Alfred;45;No;alfredHitchcock.png
1937;59;Swing It Sailor!;Comedy;Ford, Wallace;Jewell, Isabel;;6;No;NicholasCage.png
1991;83;Strictly Business;Comedy;Davidson, Tommy;Johnson, Anne-Marie;Hooks, Kevin;3;No;NicholasCage.png
1983;90;Blame It on Rio;Comedy;Caine, Michael;Johnson, Michelle;Donen, Stanley;10;No;NicholasCage.png
1987;86;Straight to Hell;Action;Hopper, Dennis;Jones, Grace;Cox, Alex;47;No;NicholasCage.png
1990;131;A View to a Kill;Action;Moore, Roger;Jones, Grace;;44;No;NicholasCage.png
1986;100;American Anthem;Drama;Gaylord, Mitch;Jones, Janet;Magnoli, Albert;74;No;NicholasCage.png
1963;99;Bedtime Story;Comedy;Brando, Marlon;Jones, Shirley;Levy, Ralph;7;No;brando.png
1991;117;Courtship of Eddie's Father, The;Comedy;Howard, Ron;Jones, Shirley;;43;No;NicholasCage.png
1988;102;Night Train to Katmandu, The;Action;Roberts, Pernell;Jovovich, Milla;Wiemer, Robert;43;No;NicholasCage.png
1948;100;Port of Call;Drama;Eklund, Bengt;Jönsson, Nine-Christine;Bergman, Ingmar;29;No;Bergman.png
1973;103;Paper Moon;Comedy;O'Neal, Ryan;Kahn, Madeline;Bogdanovich, Peter;3;Yes;NicholasCage.png
1983;97;Yellowbeard;Comedy;Chapman, Graham;Kahn, Madeline;Damski, Mel;34;No;NicholasCage.png
1975;91;Adventures of Sherlock Holmes' Smarter;Comedy;Wilder, Gene;Kahn, Madeline;Wilder, Gene;42;No;NicholasCage.png
1990;108;Flashback;Comedy;Hopper, Dennis;Kane, Carol;Amurri, Franco;19;No;NicholasCage.png
1977;89;World's Greatest Lover, The;Comedy;Wilder, Gene;Kane, Carol;Wilder, Gene;42;No;NicholasCage.png
1955;67;Killer's Kiss;Mystery;Silvera, Frank;Kane, Irene;Kubrick, Stanley;66;No;NicholasCage.png
1988;103;Deceivers, The;Action;Brosnan, Pierce;Kapoor, Shashi;Meyer, Nicholas;14;No;NicholasCage.png
1983;97;Breathless;Action;Gere, Richard;Kaprisky, Valerie;McBride, Jim;51;No;NicholasCage.png
1989;145;Born on the Fourth of July;Drama;Cruise, Tom;Kava, Caroline;Stone, Oliver;8;Yes;NicholasCage.png
1991;120;Awakenings;Drama;De Niro, Robert;Kavner, Julie;Marshall, Penny;8;No;NicholasCage.png
1977;94;Annie Hall;Comedy;Allen, Woody;Keaton, Diane;Allen, Woody;68;Yes;woody.png
1979;96;Manhattan;Comedy;Allen, Woody;Keaton, Diane;Allen, Woody;82;Yes;woody.png
1981;195;Reds;Drama;Beatty, Warren;Keaton, Diane;Beatty, Warren;76;Yes;NicholasCage.png
1986;105;Crimes of the Heart;Comedy;Shepard, Sam;Keaton, Diane;Beresford, Bruce;84;No;NicholasCage.png
1977;136;Looking for Mr. Goodbar;Drama;Atherton, William;Keaton, Diane;Brooks, Richard;54;No;NicholasCage.png
1972;175;Godfather, The;Drama;Brando, Marlon;Keaton, Diane;Coppola, Francis Ford;8;Yes;brando.png
1974;201;Godfather, Pt 2., The;Drama;Pacino, Al;Keaton, Diane;Coppola, Francis Ford;8;Yes;NicholasCage.png
1976;109;I Will, I Will...For Now;Comedy;Gould, Elliott;Keaton, Diane;Panama, Norman;6;No;NicholasCage.png
1972;86;Play It Again, Sam;Comedy;Allen, Woody;Keaton, Diane;Ross, Herbert;81;No;woody.png
1975;82;Love & Death;Comedy;Allen, Woody;Keaton, Diane;;84;No;woody.png
1973;88;Sleeper;Comedy;Allen, Woody;Keaton, Diane;;59;No;woody.png
1970;130;Fellini Satyricon;Drama;Potter, Martin;Keller, Hiram;Fellini, Federico;88;No;NicholasCage.png
1980;117;Formula, The;Mystery;Scott, George C.;Keller, Marthe;Avildsen, John G.;82;No;NicholasCage.png
1977;143;Black Sunday;Drama;Shaw, Robert;Keller, Marthe;Frankenheimer, John;76;No;NicholasCage.png
1977;124;Bobby Deerfield;Drama;Pacino, Al;Keller, Marthe;Pollack, Sydney;36;No;NicholasCage.png
1972;98;Last of the Red Hot Lovers;Comedy;Arkin, Alan;Kellerman, Sally;Saks, Gene;40;No;NicholasCage.png
1953;116;Mogambo;Action;Gable, Clark;Kelly, Grace;Ford, John;71;No;johnFord.png
1955;103;To Catch a Thief;Mystery;Grant, Cary;Kelly, Grace;Hitchcock, Alfred;69;No;alfredHitchcock.png
1954;113;Rear Window;Mystery;Stewart, James;Kelly, Grace;Hitchcock, Alfred;25;No;alfredHitchcock.png
1945;69;Woman Who Came Back;Drama;Kruger, Otto;Kelly, Nancy;Colmes, Walter;26;No;NicholasCage.png
1939;101;Stanley & Livingstone;Action;Tracy, Spencer;Kelly, Nancy;King, Henry;11;No;spencerTracy.png
1956;129;Bad Seed, The;Horror;Jones, Henry;Kelly, Nancy;LeRoy, Mervyn;69;No;NicholasCage.png
1989;113;Lethal Weapon 2;Action;Gibson, Mel;Kensit, Patsy;Donner, Richard;69;No;NicholasCage.png
1992;79;Blame It on the Bellboy;Comedy;Moore, Dudley;Kensit, Patsy;Herman, Mark;69;No;NicholasCage.png
1927;62;Drop Kick, The;Drama;Barthelmess, Richard;Kent, Barbara;Webb, Millard;;No;NicholasCage.png
1978;145;Superman, The Movie;Action;Brando, Marlon;Kidder, Margot;Donner, Richard;87;No;brando.png
1987;90;Superman IV: The Quest for Peace;Action;Reeve, Christopher;Kidder, Margot;Furie, Sidney J.;77;No;NicholasCage.png
1970;90;Quackser Fortune Has a Cousin in the Bronx;Comedy;Wilder, Gene;Kidder, Margot;Waris, Hussein;49;No;NicholasCage.png
1989;96;Dead Calm;Mystery;Neill, Sam;Kidman, Nicole;Noyce, Phillip;1;No;NicholasCage.png
1990;107;Days of Thunder;Action;Cruise, Tom;Kidman, Nicole;Scott, Tony;3;No;NicholasCage.png
1987;101;My Life As a Dog;Comedy;Glanzelius, Anton;Kinnaman, Melinda;Hallström, Lasse;21;No;NicholasCage.png
1983;;Moon in the Gutter, The;Action;Depardieu, Gérard;Kinski, Nastassia;Beineix, Jean-Jacques;29;No;NicholasCage.png
1984;150;Paris, Texas;Drama;Stanton, Harry Dean;Kinski, Nastassia;Wenders, Wim;27;No;NicholasCage.png
1984;96;Unfaithfully Yours;Comedy;Moore, Dudley;Kinski, Nastassia;Zieff, Howard;73;No;NicholasCage.png
1987;95;Bullseye!;Comedy;Caine, Michael;Kirkland, Sally;Winner, Michael;8;No;NicholasCage.png
1989;104;Erik the Viking;Action;Robbins, Tim;Kitt, Eartha;Jones, Terry;25;No;NicholasCage.png
1987;90;Dragonard;Drama;Reed, Oliver;Kitt, Eartha;Kikoine, Gerard;71;No;NicholasCage.png
1986;90;Hard Choices;Drama;McCleery, Gary;Klenck, Margaret;King, Rick;41;No;NicholasCage.png
1969;102;Rain People, The;Drama;Caan, James;Knight, Shirley;Coppola, Francis Ford;78;No;NicholasCage.png
1984;106;A Year of the Quiet Sun;Drama;Wilson, Scott;Komorowska, Maja;Zanussi, Krzystoff;78;No;NicholasCage.png
1935;54;Desert Trail, The;Western;Wayne, John;Kornman, Mary;Collins, Lewis D.;50;No;johnWayne.png
1990;98;Almost an Angel;Comedy;Hogan, Paul;Kozlowski, Linda;Cornell, John;14;No;NicholasCage.png
1986;98;Crocodile Dundee;Comedy;Hogan, Paul;Kozlowski, Linda;Faiman, Peter;66;No;NicholasCage.png
1977;127;American Friend, The;Mystery;Hopper, Dennis;Kreuzer, Lisa;Wenders, Wim;35;No;NicholasCage.png
1989;119;See You in the Morning;Drama;Bridges, Jeff;Krige, Alice;Pakula, Alan J.;53;No;NicholasCage.png
1987;88;Arrogant, The;Drama;Graham, Gary;Kristel, Sylvia;Blot, Philippe;62;No;NicholasCage.png
1989;86;Dracula's Widow;Horror;Sommer, Josef;Kristel, Sylvia;Coppola, Christopher;55;No;NicholasCage.png
1987;90;Ninja Masters of Death;Action;Peterson, Chris;Kruize, Kelly;Lambert, Bruce;15;No;NicholasCage.png
1990;110;Mystery Train;Comedy;Nagase, Masatoshi;Kudoh, Youki;Jarmusch, Jim;23;No;NicholasCage.png
1978;114;Go Tell the Spartans;War;Lancaster, Burt;Kumagai, Denice;Post, Ted;67;No;burtLancaster.png
1986;89;True Stories;Comedy;Goodman, John;Kurtz, Swoosie;Byrne, David;79;No;NicholasCage.png
1953;94;Ugetsu Monogatari;Drama;Mori, Masayuki;Kyô, Machiki;Mizoguchi, Kenji;82;No;NicholasCage.png
1969;80;Rebel Rousers;Action;Nicholson, Jack;Ladd, Diane;Cohen, Martin B.;44;No;JackNicholson.png
1988;98;Plain Clothes;Comedy;Howard, Arliss;Ladd, Diane;Coolidge, Martha;4;No;NicholasCage.png
1981;119;Whose Life Is It, Anyway?;Drama;Dreyfuss, Richard;Lahti, Christine;Badham, John;62;No;NicholasCage.png
1988;116;Running on Empty;Drama;Hirsch, Judd;Lahti, Christine;Lumet, Sidney;2;No;NicholasCage.png
1990;101;Funny about Love;Comedy;Wilder, Gene;Lahti, Christine;Nimoy, Leonard;60;No;NicholasCage.png
1985;118;A Chorus Line, The Movie;Music;Douglas, Michael;Landers, Audrey;Attenborough, Richard;71;No;NicholasCage.png
1986;84;Stewardess School;Comedy;Most, Donald;Landers, Judy;Blancato, Ken;28;No;NicholasCage.png
1987;109;Big Town, The;Drama;Dillon, Matt;Lane, Diane;Bolt, Ben;11;No;NicholasCage.png
1983;94;Rumble Fish;Drama;Dillon, Matt;Lane, Diane;Coppola, Francis Ford;4;No;NicholasCage.png
1983;91;Outsiders, The;Drama;Howell, C. Thomas;Lane, Diane;Coppola, Francis Ford;56;No;NicholasCage.png
1990;94;Priceless Beauty;Science Fiction;Lambert, Christopher;Lane, Diane;Finch, Charles;7;No;NicholasCage.png
1989;93;Streets of Fire;Action;Paré, Michael;Lane, Diane;Hill, Walter;65;No;NicholasCage.png
1990;115;Men Don't Leave;Drama;Howard, Arliss;Lange, Jessica;Brickman, Paul;66;No;NicholasCage.png
1988;127;Everybody's All American;Romance;Quaid, Dennis;Lange, Jessica;Hackford, Taylor;62;No;NicholasCage.png
1992;128;Cape Fear;Mystery;De Niro, Robert;Lange, Jessica;Scorsese, Martin;7;No;NicholasCage.png
1992;121;Postman Always Rings Twice, The;Mystery;Nicholson, Jack;Lange, Jessica;;24;No;NicholasCage.png
1949;58;Crashing Thru;Western;Wilson, Whip;Larson, Christine;Taylor, Ray;19;No;NicholasCage.png
1978;109;Get Out Your Handkerchiefs;Comedy;Depardieu, Gérard;Laure, Carole;Blier, Bertrand;78;Yes;NicholasCage.png
1971;137;Boy Friend, THe;Music;Gable, Christopher;Lawson, Twiggy;Russell, Ken;8;No;NicholasCage.png
1990;100;Hard To Kill;Action;Seagal, Steven;LeBrock, Kelly;Malmuth, Bruce;49;No;NicholasCage.png
1960;109;Psycho;Horror;Perkins, Anthony;Leigh, Janet;Hitchcock, Alfred;56;No;alfredHitchcock.png
1957;112;Jet Pilot;Action;Wayne, John;Leigh, Janet;Sternberg, Josef von;43;No;johnWayne.png
1987;95;Under Cover;Mystery;Neidorf, David;Leigh, Jennifer Jason;Stockwell, John;36;No;NicholasCage.png
1951;122;A Streetcar Named Desire;Drama;Brando, Marlon;Leigh, Vivien;Kazan, Elia;75;Yes;brando.png
1986;93;Golden Child, The;Comedy;Murphy, Eddie;Lewis, Charlotte;Ritchie, Michael;86;No;NicholasCage.png
1971;84;Statue, The;Drama;Niven, David;Lisi, Virna;Amateau, Rod;80;No;NicholasCage.png
1985;128;Christopher Columbus;Drama;Byrne, Gabriel;Lisi, Virna;Lattuada, Alberto;69;No;NicholasCage.png
1989;116;In Country;Drama;Willis, Bruce;Lloyd, Emily;Jewison, Norman;76;No;NicholasCage.png
1978;132;Wild Geese, The;Action;Burton, Richard;Lloyd, Rosalind;McLaglen, Andrew V.;21;No;NicholasCage.png
1974;90;Second Coming of Suzanne., The;Drama;Dreyfuss, Richard;Locke, Sondra;Barry, Michael;21;No;NicholasCage.png
1980;116;Bronco Billy;Westerns;Eastwood, Clint;Locke, Sondra;Eastwood, Clint;57;No;clintEastwood.png
1977;109;Gauntlet, The;Action;Eastwood, Clint;Locke, Sondra;Eastwood, Clint;18;No;clintEastwood.png
1986;105;Ratboy;Drama;Townsend, Robert;Locke, Sondra;Locke, Sondra;1;No;NicholasCage.png
1938;96;Lady Vanishes;Mystery;Redgrave, Michael;Lockwood, Margaret;Hitchcock, Alfred;27;No;alfredHitchcock.png
1987;95;Kitchen Toto, THe;Drama;Peck, Bob;Logan, Phyllis;Hook, Harry;41;No;NicholasCage.png
1959;88;Carlton-Browne of the F.O.;Comedy;Terry-Thomas;Lohr, Marie;Boulting, Roy;63;No;NicholasCage.png
1929;68;Racketeer;Drama;Armstrong, Robert;Lombard, Carole;Higgin, Howard;2;No;NicholasCage.png
1941;95;Mr. & Mrs. Smith;Comedy;Montgomery, Robert;Lombard, Carole;Hitchcock, Alfred;3;No;alfredHitchcock.png
1986;132;Alrededor de Medianoche;Drama;Francois Cluzet;Lonette McKee;Rayfield, David;47;No;NicholasCage.png
1982;101;Losin' It;Comedy;Cruise, Tom;Long, Shelley;Hanson, Curtis;4;No;NicholasCage.png
1987;114;Into the Homeland;Action;Boothe, Powers;Longstreth, Emily;Glatter, Lesli Linka;34;No;NicholasCage.png
1991;60;Boxing Babes;Action;Nichol, Robin;Lords, Traci;Dell, Stewart;9;No;NicholasCage.png
1991;94;Shock 'em Dead;Horror;Donahue, Troy;Lords, Traci;Freed, Mark;31;No;NicholasCage.png
1960;101;Heller in Pink Tights;Drama;Quinn, Anthony;Loren, Sophia;Cukor, George;52;No;sophiaLoren.png
1961;100;Two Women;Drama;Belmondo, Jean-Paul;Loren, Sophia;De Sica, Vittorio;83;Yes;sophiaLoren.png
1954;107;Gold of Naples, The;Drama;De Sica, Vittorio;Loren, Sophia;De Sica, Vittorio;40;No;sophiaLoren.png
1963;118;Yesterday, Today & Tomorrow;Comedy;Mastroianni, Marcello;Loren, Sophia;De Sica, Vittorio;73;Yes;sophiaLoren.png
1957;109;Legend of the Lost;Action;Wayne, John;Loren, Sophia;Hathaway, Henry;84;No;sophiaLoren.png
1978;111;Brass Target;Action;Cassavetes, John;Loren, Sophia;Hough, John;53;No;sophiaLoren.png
1964;188;Fall of the Roman Empire, The;Drama;Boyd, Stphen;Loren, Sophia;Mann, Anthony;62;No;sophiaLoren.png
1961;172;El Cid;Drama;Heston, Charlton;Loren, Sophia;Mann, Anthony;10;No;sophiaLoren.png
1958;114;Desire under the Elms;Drama;Perkins, Anthony;Loren, Sophia;Mann, Delbert;13;No;sophiaLoren.png
1953;92;Two Nights with Cleo;Drama;Sordi, Alberto;Loren, Sophia;Mattoli, Mario;54;No;sophiaLoren.png
1959;;Black Orchid, The;Drama;Quinn, Anthony;Loren, Sophia;Ritt, Martin;54;No;sophiaLoren.png
1977;91;Angela;Drama;Railsback, Steve;Loren, Sophia;Sagal, Boris;80;No;sophiaLoren.png
1977;105;A Special Day;Drama;Mastroianni, Marcello;Loren, Sophia;Scola, Ettore;80;Yes;sophiaLoren.png
1979;112;Blood Feud;Action;Mastroianni, Marcello;Loren, Sophia;Wertmuller, Lina;52;No;sophiaLoren.png
1991;145;Sophia Loren, Her Own Story;Drama;Gavin, John;Loren, Sophia;;49;No;sophiaLoren.png
1990;;Running Away;Drama;Loggia, Robert;Loren, Sophia;;2;No;sophiaLoren.png
1991;130;Man of La Mancha;Music;O'Toole, Peter;Loren, Sophia;;55;No;sophiaLoren.png
1992;116;Operation Crossbow;Action;Peppard, George;Loren, Sophia;;1;No;sophiaLoren.png
1986;141;Courage;Drama;Williams, Billy Dee;Loren, Sophia;;56;No;sophiaLoren.png
1986;94;RAD;Action;Allen, Bill;Loughlin, Lori;Needham, Hal;75;No;NicholasCage.png
1992;98;Secret Admirer;Comedy;Howell, C. Thomas;Loughlin, Lori;;55;No;NicholasCage.png
1979;85;Cocaine Cowboys;Action;Palance, Jack;Love, Suzanna;Lommel, Ulli;17;No;NicholasCage.png
1991;118;Test Pilot;Drama;Gable, Clark;Loy, Myrna;;13;No;NicholasCage.png
1943;64;Ape Man, The;Horror;Ford, Wallace;Lugosi, Bela;Beaudine, William;83;No;NicholasCage.png
1986;125;Mission, The;Drama;De Niro, Robert;Lunghi, Cherie;Joffe, Roland;20;No;NicholasCage.png
1991;102;Curly Sue;Comedy;Belushi, Jim;Lynch, Kelly;Hughes, John;2;No;NicholasCage.png
1962;150;Lolita;Drama;Mason, James;Lyon, Sue;Kubrick, Stanley;80;No;NicholasCage.png
1989;101;Sex, Lies, and Videotape;Drama;Spader, James;MacDowell, Andie;Soderbergh, Steven;70;Yes;NicholasCage.png
1990;107;Green Card;Comedy;Depardieu, Gérard;MacDowell, Andie;Weir, Peter;25;No;NicholasCage.png
1988;95;Gator Bait II;Action;Muzzcat, Paul;MacKenzie, Jan;Sebastian, Beverly;73;No;NicholasCage.png
1979;129;Being There;Comedy;Sellers, Peter;MacLaine, Shirley;Ashby, Hal;31;Yes;NicholasCage.png
1983;132;Terms of Endearment;Drama;Nicholson, Jack;MacLaine, Shirley;Brooks, James L.;32;Yes;JackNicholson.png
1967;99;Woman Times Seven;Comedy;Sellers, Peter;MacLaine, Shirley;De Sica, Vittorio;36;No;NicholasCage.png
1968;;Bliss of Mrs. Blossom, The;Comedy;Booth, James;MacLaine, Shirley;McGrath, Joseph;86;No;NicholasCage.png
1990;101;Postcards from the Edge;Comedy;Quaid, Dennis;MacLaine, Shirley;Nichols, Mike;63;No;NicholasCage.png
1970;105;Two Mules for Sister Sara;Western;Eastwood, Clint;MacLaine, Shirley;Siegel, Don;36;No;clintEastwood.png
1992;84;Dragonfight;Drama;Z'Dar, Robert;MacLaren, Fawna;;71;No;NicholasCage.png
1939;85;Back Door to Heaven;Drama;Ford, Wallace;MacMahon, Aline;Howard, William K.;83;No;NicholasCage.png
1988;100;Ciao Italia, Madonna Live from Italy;Music;;Madonna;De Winter, Harry;74;No;NicholasCage.png
1991;118;Madonna, Truth or Dare;Music;;Madonna;Keshishian, Alek;54;No;NicholasCage.png
1992;60;A Certain Sacrifice;Music;Pattnosh, Jeremy;Madonna;Lewicki, Steven Jon;24;No;NicholasCage.png
1991;40;National Enquirer, The Untold Story;Music;White, Vanna;Madonna;;65;No;NicholasCage.png
1990;60;Immaculate Collection, The;Music;;Madonna;;32;No;NicholasCage.png
1987;50;Madonna Live, The Virgin Tour;Music;;Madonna;;75;No;NicholasCage.png
1990;5;Madonna, Justify My Love;Music;;Madonna;;77;No;NicholasCage.png
1991;16;Madonna, Like a Virgin;Music;;Madonna;;63;No;NicholasCage.png
1988;83;Hot to Trot;Comedy;Goldthwait, Bob;Madsen, Virginia;Dinner, Michael;78;No;NicholasCage.png
1986;103;Fire with Fire;Drama;Sheffer, Craig;Madsen, Virginia;Gibbins, Duncan;9;No;NicholasCage.png
1990;120;Hot Spot;Drama;Johnson, Don;Madsen, Virginia;Hopper, Dennis;70;No;NicholasCage.png
1974;124;Amarcord;Drama;Noel, Magali;Maggio, Pupella;Fellini, Federico;50;Yes;NicholasCage.png
1988;85;Casablanca Express;Action;Connery, Jason;Maneri, Luisa;Martino, Sergio;33;No;NicholasCage.png
1980;94;Out of the Blue;Drama;Hopper, Dennis;Manz, Linda;Hopper, Dennis;4;No;NicholasCage.png
1949;110;Sands of Iwo Jima;War;Wayne, John;Mara, Adele;Dwan, Allan;72;No;johnWayne.png
1981;104;Hand, The;Horror;Caine, Michael;Marcovicci, Andrea;Stone, Oliver;44;No;NicholasCage.png
1989;81;Deep Cover;Mystery;Conti, Tom;Markham, Kika;Loncraine, Richard;15;No;NicholasCage.png
1955;92;Il Bidone;Drama;Crawford, Broderick;Masina, Guilietta;Fellini, Federico;70;No;NicholasCage.png
1986;130;El Guerrero Solitario;Drama;Eastwood, Clint;Mason, Marsha;Eastwood, Clint;77;No;clintEastwood.png
1986;130;Heartbreak Ridge;War;Eastwood, Clint;Mason, Marsha;Eastwood, Clint;61;No;clintEastwood.png
1977;110;Goodbye Girl, The;Comedy;Dreyfuss, Richard;Mason, Marsha;Ross, Herbert;6;Yes;NicholasCage.png
1991;113;Audrey Rose;Drama;Hopkins, Anthony;Mason, Marsha;;62;No;AnthonyHopkins.png
1981;86;Polyester;Comedy;Divine;Massey, Edith;;68;No;NicholasCage.png
1991;144;Robin Hood: Prince of Thieves;Action;Costner, Kevin;Mastrantonio, Mary Elizabeth;Costner, Kevin;8;No;NicholasCage.png
1992;101;White Sands;Drama;Dafoe, Willem;Mastrantonio, Mary Elizabeth;Donaldson, Roger;38;No;NicholasCage.png
1986;119;Color of Money, The;Drama;Newman, Paul;Mastrantonio, Mary Elizabeth;Scorsese, Martin;6;Yes;paulNewman.png
1986;119;Children of a Lesser God;Drama;Hurt, William;Matlin, Marlee;Haines, Randa;20;Yes;NicholasCage.png
1986;;Matador;Comedy;Banderas, Antonio;Maura, Carmen;Almodóvar, Pedro;34;No;NicholasCage.png
1989;88;Women on the Verge of a Nervous Breakdown;Comedy;Banderas, Antonio;Maura, Carmen;Almodóvar, Pedro;65;No;NicholasCage.png
1980;86;Pepi Luci Bom;Comedy;Rotaeta, Félix;Maura, Carmen;Almodóvar, Pedro;66;No;NicholasCage.png
1989;100;Forgotten, The;Mystery;Carradine, Keith;Maynard, Mimi;Keach, James;69;No;NicholasCage.png
1992;89;Flame & the Arrow, The;Action;Lancaster, Burt;Mayo, Virginia;;0;No;burtLancaster.png
1990;92;After the Shock;Drama;Kotto, Yaphet;McClanahan, Rue;Sherman, Gary;28;No;NicholasCage.png
1990;110;Modern Love;Comedy;Benson, Robby;McClanahan, Rue;;18;No;NicholasCage.png
1992;95;Riff Raff;Comedy;Carlyle, Robert;McCourt, Emer;Loach, Ken;71;No;NicholasCage.png
1967;81;Glory Stompers, The;Action;Hopper, Dennis;McCrea, Jody;Lanza, Anthony M.;27;No;NicholasCage.png
1990;181;Dances with Wolves;Western;Costner, Kevin;McDonnell, Mary;Costner, Kevin;8;Yes;NicholasCage.png
1987;130;Matewan;Drama;Jones, James Earl;McDonnell, Mary;Sayles, John;81;No;NicholasCage.png
1988;120;Mississippi Burning;Drama;Hackman, Gene;McDormand, Frances;Parker, Alan;41;Yes;NicholasCage.png
1975;130;Eiger Sanction, The;Action;Eastwood, Clint;McGee, Vonetta;Eastwood, Clint;69;No;clintEastwood.png
1988;109;Unsettled Land;Drama;Shea, John;McGillis, Kelly;Barbash, Uri;75;No;NicholasCage.png
1991;98;Cat Chaser;Drama;Weller, Peter;McGillis, Kelly;Ferrera, Abel;6;No;NicholasCage.png
1988;110;Accused, The;Drama;Coulson, Bernie;McGillis, Kelly;Kaplan, Jonathan;71;Yes;NicholasCage.png
1989;109;Winter People;Drama;Russell, Kurt;McGillis, Kelly;Kotcheff, Ted;30;No;NicholasCage.png
1983;101;Reuben, Reuben;Comedy;Conti, Tom;McGillis, Kelly;Miller, Robert Ellis;2;No;NicholasCage.png
1987;102;Made in Heaven;Fantasy;Hutton, Timothy;McGillis, Kelly;Rudolph, Alan;57;No;NicholasCage.png
1986;109;Top Gun;Action;Cruise, Tom;McGillis, Kelly;Scott, Tony;8;No;NicholasCage.png
1985;112;Witness;Drama;Ford, Harrison;McGillis, Kelly;Weir, Peter;59;No;NicholasCage.png
1988;111;House on Carroll Street, The;Mystery;Daniels, Jeff;McGillis, Kelly;;6;No;NicholasCage.png
1984;109;Racing with the Moon;Drama;Penn, Sean;McGovern, Elizabeth;Benjamin, Richard;50;No;NicholasCage.png
1983;98;Lovesick;Comedy;Moore, Dudley;McGovern, Elizabeth;Brickman, Marshall;51;No;NicholasCage.png
1988;106;She's Having a Baby;Comedy;Hughes, Kevin Bacon;McGovern, Elizabeth;;18;No;NicholasCage.png
1965;199;Greatest Story Ever Told, The;Drama;Sydow, Max von;McGuire, Dorothy;Stevens, George;26;No;NicholasCage.png
1989;105;Hawks;Drama;Dalton, Timothy;McTeer, Janet;Miller, Robert Ellis;11;No;NicholasCage.png
1981;91;So Fine;Comedy;O'Neal, Ryan;Melato, Mariangela;Bergman, Andrew;17;No;NicholasCage.png
1957;89;Paths of Glory;Drama;Douglas, Kirk;Menjou, Adolphe;Kubrick, Stanley;47;No;NicholasCage.png
1964;120;Tom Jones;Drama;Ustinov, Peter;Mercouri, Melina;Dassin, Jules;39;Yes;NicholasCage.png
1975;103;Sunshine Boys, The;Comedy;Burns, George;Meredith, Lee;Ross, Herbert;35;Yes;NicholasCage.png
1988;98;Caddyshack 2;Comedy;Mason, Jackie;Merrill, Dina;Arkush, Allan;34;No;NicholasCage.png
1990;117;Internal Affairs;Drama;Gere, Richard;Metcalf, Laurie;Figgis, Mike;3;No;NicholasCage.png
1991;206;JFK;Drama;Costner, Kevin;Metcalf, Laurie;Stone, Oliver;78;No;NicholasCage.png
1991;97;New Jack City;Action;Snipes, Wesley;Michael Michele;Van Peebles, Mario;80;No;NicholasCage.png
1991;87;Scenes from a Mall;Comedy;Allen, Woody;Midler, Bette;;8;No;woody.png
1987;118;Hope & Glory;War;Hayman, David;Miles, Sarah;Boorman, John;3;No;NicholasCage.png
1970;194;Ryan's Daughter;Drama;Mitchum, Robert;Miles, Sarah;Lean, David;81;Yes;NicholasCage.png
1973;127;Man Who Loved Cat Dancing, The;Western;Reynolds, Burt;Miles, Sarah;Sarafian, Richard C.;40;No;NicholasCage.png
1962;123;Man Who Shot Liberty Valance, The;Western;Stewart, James;Miles, Vera;Ford, John;85;No;johnFord.png
1989;102;Dead-Bang;Action;Johnson, Don;Miller, Penelope Ann;Frankenheimer, John;9;No;NicholasCage.png
1988;90;Big Top Pee-wee;Comedy;Reubens, Paul;Miller, Penelope Ann;Kleiser, Randal;17;No;NicholasCage.png
1960;103;Time Machine, The;Science Fiction;Taylor, Rod;Mimieux, Yvette;Pal, George;88;No;NicholasCage.png
1972;128;Cabaret;Drama;Grey, Joel;Minnelli, Liza;Fosse, Bob;59;Yes;NicholasCage.png
1981;97;Arthur;Comedy;Moore, Dudley;Minnelli, Liza;Gordon, Steve;79;Yes;NicholasCage.png
1976;97;A Matter of Time;Drama;Boyer, Charles;Minnelli, Liza;Minnelli, Vincente;70;No;NicholasCage.png
1977;137;New York, New York;Drama;De Niro, Robert;Minnelli, Liza;Scorsese, Martin;8;No;NicholasCage.png
1989;89;Nightmare on Elm Street, Pt. 5, The Dream Child;Horror;Englund, Robert;Minter, Kelly Jo;Hopkins, Stephen;41;No;NicholasCage.png
1980;100;Fiendish Plot of Dr. Fu Manchu, The;Comedy;Sellers, Peter;Mirren, Helen;Haggard, Piers;29;No;NicholasCage.png
1991;240;Four American Composers;Music;Cage, John;Monk, Meredith;Greenaway, Peter;3;No;NicholasCage.png
1950;112;Asphalt Jungle, The;Action;Hayden, Sterling;Monroe, Marilyn;Huston, John;77;No;NicholasCage.png
1992;61;Ladies of the Chorus;Music;Garr, Eddie;Monroe, Marilyn;Karlson, Phil;60;No;NicholasCage.png
1953;95;How to Marry a Millionaire;Comedy;Powell, William;Monroe, Marilyn;Negulesco, Jean;65;No;NicholasCage.png
1983;;Hollywood Out-Takes & Rare Footage;Comedy;Bogart, Humphrey;Monroe, Marilyn;;27;No;NicholasCage.png
1991;94;Nothing But Trouble;Comedy;Candy, John;Moore, Demi;Aykroyd, Dan;25;No;NicholasCage.png
1987;109;Wisdom;Action;Estevez, Emilio;Moore, Demi;Estevez, Emilio;25;No;NicholasCage.png
1986;94;One Crazy Summer;Comedy;Cusack, John;Moore, Demi;Holland, Savage Steve;61;No;NicholasCage.png
1989;110;We're No Angels;Comedy;De Niro, Robert;Moore, Demi;Jordan, Neil;51;No;NicholasCage.png
1984;102;No Small Affair;Comedy;Cryer, Jon;Moore, Demi;Schatzberg, Jerry;10;No;NicholasCage.png
1990;127;Ghost;Science Fiction;Swayze, Patrick;Moore, Demi;Zucker, Jerry;6;Yes;NicholasCage.png
1986;113;About Last Night;Drama;Lowe, Rob;Moore, Demi;Zwick, Edward;66;No;NicholasCage.png
1982;107;Six Weeks;Drama;Moore, Dudley;Moore, Mary Tyler;Bill, Tony;73;No;NicholasCage.png
1948;89;Return of October;Comedy;Ford, Glenn;Moore, Terry;Lewis, Joseph H.;35;No;glennFord.png
1952;99;Come Back, Little Sheba;Drama;Lancaster, Burt;Moore, Terry;Mann, Daniel;50;Yes;burtLancaster.png
1974;117;Going Places;Drama;Depardieu, Gérard;Moreau, Jeanne;Blier, Bertrand;66;No;NicholasCage.png
1970;99;Monte Walsh;Western;Marvin, Lee;Moreau, Jeanne;Fraker, William A.;29;No;NicholasCage.png
1955;100;Mr. Arkadin;Drama;Welles, Orson;Mori, Paola;Welles, Orson;80;No;NicholasCage.png
1988;;White of the Eye;Mystery;Keith, David;Moriarty, Cathy;Cammell, Donald;48;No;NicholasCage.png
1968;90;Producers, The;Comedy;Wilder, Gene;Mostel, Zero;Brooks, Mel;33;No;NicholasCage.png
1976;94;Front, The;Drama;Allen, Woody;Mostel, Zero;Ritt, Martin;70;No;woody.png
1987;86;House of the Rising Sun;Drama;Annese, Frank;Moyer, Tawny;Gold, Greg;45;No;NicholasCage.png
1988;91;In a Shallow Grave;Drama;Biehn, Michael;Mueller, Maureen;Bowser, Kenneth;72;No;NicholasCage.png
1974;111;Mc Q;Action;Wayne, John;Muldaur, Diana;Sturges, John;73;No;johnWayne.png
1941;85;Lady from Louisiana;Drama;Wayne, John;Munson, Ona;Vorhaus, Bernard;38;No;johnWayne.png
1990;102;Wait Until Spring Bandini;Drama;Mantegna, Joe;Muti, Ornella;Deruddere, Dominique;29;No;NicholasCage.png
1940;105;Long Voyage Home, The;Drama;Wayne, John;Natwick, Mildred;Ford, John;88;No;johnWayne.png
1955;100;Trouble with Harry, The;Mystery;Forsythe, John;Natwick, Mildred;Hitchcock, Alfred;28;No;alfredHitchcock.png
1987;60;Encounters;Drama;Von Bergan, Raven;Navarro, Monica;Marder, Bruce;44;No;NicholasCage.png
1963;112;Hud;Drama;Newman, Paul;Neal, Patricia;Ritt, Martin;2;Yes;paulNewman.png
1951;111;Operation Pacific;War;Wayne, John;Neal, Patricia;;5;No;johnWayne.png
1987;83;Surf Nazis Must Die;Horror;Brenner, Barry;Neely, Gail;George, Peter;50;No;NicholasCage.png
1956;124;Teahouse of the August Moon;Drama;Brando, Marlon;Negami, Jun;Mann, Daniel;11;No;brando.png
1992;88;Back in the U.S.S.R.;Action;Whaley, Frank;Negoda, Natalya;;61;No;NicholasCage.png
1970;91;Man Who Haunted Himself, The;Drama;Moore, Roger;Neil, Hildegard;Dearden, Basil;75;No;NicholasCage.png
1991;108;Prisoner of Honor.;Drama;Dreyfuss, Richard;Neilson, Catherine;Russell, Ken;58;No;NicholasCage.png
1988;83;Control;Drama;Lancaster, Burt;Nelligan, Kate;;27;No;burtLancaster.png
1923;57;Desert Rider;Western;Hoxie, Jack;Nelson, Evelyn;Bradbury, Robert N.;;No;NicholasCage.png
1980;109;Wholly Moses!;Comedy;Moore, Dudley;Newman, Laraine;Weis, Gary;25;No;NicholasCage.png
1991;110;Star Trek VI: The Undiscovered Country;Science Fiction;Shatner, William;Nichols, Nichelle;Meyer, Nicholas;11;No;NicholasCage.png
1989;107;Star Trek V: The Final Frontier;Action;Shatner, William;Nichols, Nichelle;Shatner, William ;87;No;NicholasCage.png
1991;85;Circuitry Man;Action;Metzler, Jim;Nicholson, Dana W.;Lovy, Steven;78;No;NicholasCage.png
1986;87;Cobra;Action;Stallone, Sylvester;Nielsen, Brigitte;Cosmatos, George P.;57;No;NicholasCage.png
1987;103;Beverly Hills Cop II;Comedy;Murphy, Eddie;Nielsen, Brigitte;Scott, Tony;37;No;NicholasCage.png
1990;90;Red Sonja;Action;Schwarzenegger, Arnold;Nielsen, Brigitte;;40;No;NicholasCage.png
1950;93;To Joy;Drama;Olin, Stig;Nilsson, Maj-Britt;Bergman, Ingmar;65;No;Bergman.png
1992;112;Macbeth;Drama;Welles, Orson;Nolan, Jeanette;;45;No;NicholasCage.png
1958;128;Vertigo;Drama;Stewart, James;Novak, Kim;Hitchcock, Alfred;10;No;alfredHitchcock.png
1987;91;Young Love: Lemon Popsicle Seven;Comedy;Katzur, Yftach;Noy, Zachi;Bennett, Walter;47;No;NicholasCage.png
1946;93;Crack-Up;Mystery;Marshall, Herbert;O'Brien, Pat;Reis, Irving;25;No;NicholasCage.png
1941;57;Bury Me Not on the Lone Prairie;Western;Brown, Johnny Mack;O'Day, Nell;Taylor, Ray;85;No;NicholasCage.png
1940;57;Law & Order;Western;Brown, Johnny Mack;O'Day, Nell;Taylor, Ray;87;No;NicholasCage.png
1941;56;Man from Montana;Western;Brown, Johnny Mack;O'Day, Nell;Taylor, Ray;85;No;NicholasCage.png
1992;137;Long Gray Line, The;Drama;Power, Tyrone;O'Hara, Maureen;Ford, John;26;No;johnFord.png
1950;105;Rio Grande;Western;Wayne, John;O'Hara, Maureen;Ford, John;64;No;johnWayne.png
1957;107;Wings of Eagles, The;Drama;Wayne, John;O'Hara, Maureen;Ford, John;29;No;johnWayne.png
1939;94;Jamaica Inn;Drama;Laughton, Charles;O'Hara, Maureen;Hitchcock, Alfred;75;No;alfredHitchcock.png
1971;110;Big Jake;Action;Wayne, John;O'Hara, Maureen;Sherman, George;68;No;johnWayne.png
1992;153;Quiet Man, The;Drama;Wayne, John;O'Hara, Maureen;;74;No;johnWayne.png
1983;72;After the Rehearsal;Drama;Josephson, Erland;Olin, Lena;Bergman, Ingmar;0;No;Bergman.png
1952;90;Big Jim McLain;Western;Wayne, John;Olson, Nancy;Ludwig, Edward;14;No;johnWayne.png
1969;101;Smith!;Western;Ford, Glenn;Olson, Nancy;O'Herlihy, Michael;62;No;glennFord.png
1953;79;Wild One, The;Drama;Brando, Marlon;O'Malley, Pat;Benedek, Laslo;26;No;brando.png
1929;129;Manxman, The;Drama;Brisson, Carl;Ondra, Anny;Hitchcock, Alfred;65;No;alfredHitchcock.png
1978;126;International Velvet;Drama;Hopkins, Anthony;O'Neal, Tatum;Forbes, Bryan;40;No;AnthonyHopkins.png
1981;104;Scanners;Horror;Lack, Stephen;O'Neill, Jennifer;Cronenberg, David;32;No;NicholasCage.png
1986;98;Trick or Treat;Horror;Price, Marc;Orgolini, Lisa;Smith, Charles Martin;47;No;NicholasCage.png
1982;92;48 Hrs.;Action;Nolte, Nick;O'Toole, Annette;Hill, Walter;67;No;NicholasCage.png
1985;108;Trip to Bountiful, The;Drama;Heard, John;Page, Geraldine;Masterson, Peter;62;Yes;NicholasCage.png
1955;116;Mister Roberts;Comedy;Fonda, Henry;Palmer, Betsy;Ford, John;8;Yes;johnFord.png
1969;127;Z;Drama;Montand, Yves;Papas, Irene;Costa-Gavras;72;Yes;NicholasCage.png
1987;139;Maurice;Drama;Wilby, James;Parfitt, Judy;Ivory, James;45;No;NicholasCage.png
1969;114;Hamlet;Drama;Williamson, Nicol;Parfitt, Judy;Richardson, Tony;39;No;NicholasCage.png
1991;117;La Femme Nikita;Drama;Karyo, Tcheky;Parillaud, Anne;Besson, Luc;6;No;NicholasCage.png
1993;95;Honeymoon in Vegas;Comedy;Caan, James;Parker, Sarah Jessica;Bergman, Andrew;53;No;NicholasCage.png
1988;90;Going for the Gold;Action;Edwards, Anthony;Parker, Sarah Jessica;Taylor, Dan;10;No;clintEastwood.png
1976;128;Shout at the Devil;Action;Marvin, Lee;Parkins, Barbara;Hunt, Peter R.;0;No;NicholasCage.png
1986;94;A Smoky Mountain Christmas;Music;Majors, Lee;Parton, Dolly;Winkler, Henry;23;No;NicholasCage.png
1984;95;Getting Physical;Drama;Naughton, David;Paul, Alexandra;Stern, Steven Hilliard;75;No;NicholasCage.png
1990;95;Torn Apart;Drama;Pasdar, Adrian;Peck, Cecilia;Fisher, Jack;8;No;NicholasCage.png
1986;112;From the Hip;Comedy;Nelson, Judd;Perkins, Elizabeth;Clark, Bob;36;No;NicholasCage.png
1984;102;Ratings Game, The;Comedy;DeVito, Danny;Perlman, Rhea;DeVito, Danny;21;No;NicholasCage.png
1992;100;Class Act;Drama;Reid, Christopher;Perlman, Rhea;;88;No;NicholasCage.png
1986;89;Water;Comedy;Caine, Michael;Perrine, Valerie;Clement, Dick;47;No;NicholasCage.png
1978;88;Silent Movie;Comedy;Brooks, Mel;Peters, Bernadette;Brooks, Mel;27;No;NicholasCage.png
1989;122;Pink Cadillac;Comedy;Eastwood, Clint;Peters, Bernadette;Eastwood, Clint;12;No;clintEastwood.png
1979;94;Jerk, The;Comedy;Martin, Steve;Peters, Bernadette;Reiner, Carl;22;No;NicholasCage.png
1980;180;Wild Times;;Elliott, Sam;Peyser, Penny;Compton, Richard;75;No;NicholasCage.png
1986;107;Sweet Liberty;Comedy;Alda, Alan;Pfeiffer, Michelle;Alda, Alan;12;No;MichellePfeiffer.png
1982;115;Grease II;Music;Caulfield, Maxwell;Pfeiffer, Michelle;Birch, Patricia;64;No;MichellePfeiffer.png
1989;104;Married to the Mob;Comedy;Modine, Matthew;Pfeiffer, Michelle;Demme, Jonathan;8;No;MichellePfeiffer.png
1985;121;Ladyhawke;Adventure;Broderick, Matthew;Pfeiffer, Michelle;Donner, Richard;68;No;MichellePfeiffer.png
1989;114;Fabulous Baker Boys, The;Drama;Bridges, Jeff;Pfeiffer, Michelle;Kloves, Steve;66;No;MichellePfeiffer.png
1985;115;Into the Night;Comedy;Goldblum, Jeff;Pfeiffer, Michelle;Landis, John;62;No;MichellePfeiffer.png
1991;124;Russia House, The;Drama;Connery, Sean;Pfeiffer, Michelle;Schepisi, Fred;3;No;MichellePfeiffer.png
1988;116;Tequila Sunrise;Mystery;Gibson, Mel;Pfeiffer, Michelle;Towne, Robert;50;No;MichellePfeiffer.png
1989;74;B. A. D. Cats;Action;Morrow, Vic;Pfeiffer, Michelle;;87;No;MichellePfeiffer.png
1971;108;Last Movie, The;Drama;Hopper, Dennis;Phillips, Michelle;Hopper, Dennis;22;No;NicholasCage.png
1973;106;Dillinger;Drama;Oates, Warren;Phillips, Michelle;Milius, John;83;No;NicholasCage.png
1988;360;Little Dorrit;Drama;Jacobi, Derek;Pickering, Sarah;Edzard, Christine;12;No;NicholasCage.png
1927;78;My Best Girl;Drama;Rogers, Charles;Pickford, Mary;Taylor, Sam;31;No;NicholasCage.png
1989;93;Seizure;Horror;Frid, Jonathan;Pickles, Christina;Stone, Oliver;59;No;NicholasCage.png
1990;89;A Chorus of Disapproval;Comedy;Irons, Jeremy;Pigg, Alexandra;Winner, Michael;0;No;NicholasCage.png
1962;119;Rome Adventure;Drama;Donahue, Tony;Pleshette, Suzanne;Daves, Delmer;39;No;NicholasCage.png
1992;121;Drowning by Numbers;Mystery;Hill, Bernard;Plowright, Joan;Greenaway, Peter;28;No;NicholasCage.png
1991;88;Born to Ride;Action;Stamos, John;Polo, Teri;Baker, Graham;59;No;NicholasCage.png
1988;94;Her Alibi;Comedy;Selleck, Tom;Porizkova, Paulina;Beresford, Bruce;80;No;NicholasCage.png
1988;96;Glitz;Mystery;Smits, Jimmy;Post, Markie;;9;No;NicholasCage.png
1990;95;Dangerous Pursuit;Mystery;Harrison, Gregory;Powers, Alexandra;Stern, Sandor;88;No;NicholasCage.png
1962;123;Experiment in Terror;Mystery;Ford, Glenn;Powers, Stefanie;Edwards, Blake;77;No;glennFord.png
1972;105;Hideaways, The;Comedy;Doran, Johnny;Prager, Sally;Cook, Fielder;42;No;NicholasCage.png
1965;108;What's New Pussycat;Comedy;O'Toole, Peter;Prentiss, Paula;Donner, Clive;83;No;NicholasCage.png
1965;108;What's New Pussycat?;Comedy;Sellers, Peter;Prentiss, Paula;Donner, Clive;46;No;NicholasCage.png
1983;98;Packin' It In;Comedy;Benjamin, Richard;Prentiss, Paula;Taylor, Jud;8;No;NicholasCage.png
1988;90;Naked Gun: From the Files of Police Squad!, The;Comedy;Nielsen, Leslie;Presley, Priscilla;Zucker, David;9;No;NicholasCage.png
1990;106;In Too Deep;Drama;Race, Hugo;Press, Santha;Tatoulis, Colin South, John;50;No;NicholasCage.png
1988;107;Twins;Comedy;Schwarzenegger, Arnold;Preston, Kelly;Reitman, Ivan;23;No;NicholasCage.png
1988;94;Experts, The;Comedy;Travolta, John;Preston, Kelly;Thomas, Dave;67;No;NicholasCage.png
1989;94;Naked Lie;Drama;Lucking, William;Principal, Victoria;Colla, Richard A.;7;No;NicholasCage.png
1987;87;Mistress;Drama;Rachins, Allan;Principal, Victoria;Tuchner, Michael;36;No;NicholasCage.png
1992;;Pleasure Palace;Action;Sharif, Omar;Principal, Victoria;;45;No;NicholasCage.png
1970;100;Adam at 6 A.M.;Drama;Douglas, Michael;Purcell, Lee;Scheerer, Robert;3;No;NicholasCage.png
1990;93;Web of Deceit;Drama;Read, James;Purl, Linda;Stern, Sandor;6;No;NicholasCage.png
1991;119;New York Stories;Comedy;Allen, Woody;Questel, Mae;Coppola, Francis Ford;6;No;NicholasCage.png
1987;97;Dreams Lost, Dreams Found;Drama;Robb, David;Quinlan, Kathleen;Patterson, Willi;66;No;NicholasCage.png
1987;103;Au Revoir les Enfants;Drama;Manesse, Gaspard;Racette, Francine;Malle, Louis;35;No;NicholasCage.png
1989;122;Quo Vadis;Drama;Brandauer, Klaus Maria;Raines, Cristina;Rossi, Franco;6;No;NicholasCage.png
1949;100;Fighting Kentuckian, The;Action;Wayne, John;Ralston, Vera;Waggner, George;74;No;johnWayne.png
1974;104;Zardoz;Science Fiction;Connery, Sean;Rampling, Charlotte;Boorman, John;6;No;seanConnery.png
1989;84;Police Academy 6: City under Siege;Comedy;Smith, Bubba;Ramsey, Marion;Bonerz, Peter;29;No;NicholasCage.png
1988;90;Police Academy 5: Assignment Miami Beach;Comedy;Gaynes, George;Ramsey, Marion;Myerson, Alan;59;No;NicholasCage.png
1986;84;Police Academy 3: Back in Training;Comedy;Guttenberg, Steve;Ramsey, Marion;Paris, Jerry;6;No;NicholasCage.png
1991;60;America's Music, Blues;Music;Hopkins, Linda;Redd, Vi;Walton, Kip;54;No;NicholasCage.png
1977;100;Julia;Drama;Fonda, Jane;Redgrave, Vanessa;Zinnemann, Fred;75;Yes;NicholasCage.png
1971;111;Devils, The;Drama;Reed, Oliver;Redgrave, Vanessa;Russell, Ken;69;No;NicholasCage.png
1984;90;Ransom;Drama;Ford, Glenn;Reed, Donna;Segal, Alex;73;No;glennFord.png
1990;97;Cadillac Man;Comedy;Williams, Robin;Reed, Pamela;Donaldson, Roger;28;No;NicholasCage.png
1986;104;Best of Times, The;Comedy;Williams, Robin;Reed, Pamela;Spottiswoode, Roger;88;No;NicholasCage.png
1985;135;Death of a Salesman;Drama;Hoffman, Dustin;Reid, Kate;Schlöndorff, Volker;13;No;NicholasCage.png
1993;104;It Started with a Kiss;Drama;Ford, Glenn;Reynolds, Debbie;;80;No;glennFord.png
1989;88;Money, The;Drama;Luckinbill, Laurence;Richards, Elizabeth;;29;No;NicholasCage.png
1987;153;Empire of the Sun;Drama;Malkovich, John;Richardson, Miranda;Spielberg, Steven;6;No;NicholasCage.png
1991;102;Comfort of Strangers, The;Mystery;Walken, Christopher;Richardson, Natasha;Schrader, Paul;5;No;NicholasCage.png
1969;135;On Her Majesty's Secret Service;Action;Lazenby, George;Rigg, Diana;Hunt, Peter R.;66;No;NicholasCage.png
1986;96;Pretty in Pink;Drama;Stanton, Harry Dean;Ringwald, Molly;Deutch, Howard;75;No;NicholasCage.png
1987;90;PK. & the Kid.;Drama;LeMat, Paul;Ringwald, Molly;;49;No;NicholasCage.png
1943;60;Lone Star Trail, The;Western;Brown, Johnny Mack;Ritter, Tex;Taylor, Ray;27;No;NicholasCage.png
1986;98;Summer;Comedy;Gauthier, Vincent;Riviere, Marie;Rohmer, Eric;11;No;NicholasCage.png
1987;93;Planes, Trains & Automobiles;Comedy;Martin, Steve;Robbins, Laila;Hughes, John;73;No;NicholasCage.png
1990;119;Pretty Woman;Comedy;Gere, Richard;Roberts, Julia;Marshall, Garry;43;No;NicholasCage.png
1991;111;Flatliners;Drama;Sutherland, Kiefer;Roberts, Julia;Schumacher, Joel;19;No;NicholasCage.png
1991;142;Hook;Action;Williams, Robin;Roberts, Julia;Spielberg, Steven;4;No;NicholasCage.png
1940;56;Riders of Pasco Basin;Western;Brown, Johnny Mack;Robinson, Frances;Taylor, Ray;17;No;NicholasCage.png
1992;53;Gotta Dance, Gotta Sing;Music;Astaire, Fred;Rogers, Ginger;;20;No;NicholasCage.png
1990;106;Desperate Hours;Mystery;Rourke, Mickey;Rogers, Mimi;Cimino, Michael;58;No;NicholasCage.png
1986;111;Gung Ho;Comedy;Keaton, Michael;Rogers, Mimi;Howard, Ron;59;No;NicholasCage.png
1992;96;Shooting Elizabeth;Mystery;Goldblum, Jeff;Rogers, Mimi;Taylor, Baz;5;No;NicholasCage.png
1951;101;Strangers on a Train;Mystery;Granger, Farley;Roman, Ruth;Hitchcock, Alfred;17;No;alfredHitchcock.png
1979;198;Sacketts, The;Western;Elliott, Sam;Roman, Ruth;Totten, Robert;86;No;NicholasCage.png
1991;87;To Die Standing;Action;De Young, Cliff;Rose, Jamie;Morneau, Louis;53;No;NicholasCage.png
1980;92;Rodeo Girl;Drama;Hopkins, Bo;Ross, Katharine;Cooper, Jackie;80;No;NicholasCage.png
1969;110;Butch Cassidy & the Sundance Kid;Western;Newman, Paul;Ross, Katharine;Hill, George Roy;29;Yes;paulNewman.png
1968;121;Hellfighters;Action;Wayne, John;Ross, Katharine;McLaglen, Andrew V.;22;No;johnWayne.png
1980;92;Final Countdown, The;Action;Douglas, Kirk;Ross, Katharine;Taylor, Don;35;No;NicholasCage.png
1986;120;Blue Velvet;Mystery;MacLachlan, Kyle;Rossellini, Isabella;Lynch, David;6;No;lynch.png
1989;110;Cousins;Comedy;Danson, Ted;Rossellini, Isabella;;28;No;NicholasCage.png
1976;90;Black & White in Color;Comedy;Carmet, Jean;Rouvel, Catherine;Annaud, Jean-Jacques;24;Yes;NicholasCage.png
1988;81;Another Woman;Drama;Hackman, Gene;Rowlands, Gena;Allen, Woody;7;No;woody.png
1992;128;Night on Earth;Drama;Benigni, Roberto;Rowlands, Gena;Jarmusch, Jim;24;No;NicholasCage.png
1988;92;Permanent Record;Drama;Boyce, Alan;Rubin, Jennifer;Silver, Marisa;42;No;NicholasCage.png
1992;138;Fisher King, The;Drama;Williams, Robin;Ruehl, Mercedes;Gilliam, Terry;8;Yes;NicholasCage.png
1991;98;Another You;Comedy;Pryor, Richard;Ruehl, Mercedes;Phillips, Maurice;75;No;NicholasCage.png
1958;167;Young Lions, The;Drama;Brando, Marlon;Rush, Barbara;Dmytryk, Edward;10;No;NicholasCage.png
1988;89;Cheerleader Camp;Horror;Garrett, Leif;Russell, Betsy;Quinn, John;79;No;NicholasCage.png
1990;98;Trapper County War;Action;Hudson, Ernie;Russell, Betsy;;5;No;NicholasCage.png
1947;96;Angel & the Badman;Western;Wayne, John;Russell, Gail;Grant, James Edward;84;No;johnWayne.png
1990;109;Impulse;Mystery;Fahey, Jeff;Russell, Theresa;Locke, Sondra;23;No;NicholasCage.png
1988;91;Track Twenty-Nine;Drama;Oldman, Gary;Russell, Theresa;Roeg, Nicolas;48;No;NicholasCage.png
1991;110;Freejack;Action;Estevez, Emilio;Russo, Rene;Richardson, Tony;26;No;NicholasCage.png
1939;109;John Wayne Matinee Double Feature, No. 1;Western;Wayne, John;Rutherford, Ann;;30;No;johnWayne.png
1988;81;Smallest Show on Earth, The;Comedy;Sellers, Peter;Rutherford, Margaret;Dearden, Basil;24;No;NicholasCage.png
1987;120;Innerspace;Science Fiction;Quaid, Dennis;Ryan, Meg;Dante, Joe;41;No;NicholasCage.png
1988;97;Presidio, The;Action;Connery, Sean;Ryan, Meg;Hyams, Peter;4;No;seanConnery.png
1990;102;Joe Versus the Volcano;Comedy;Hanks, Tom;Ryan, Meg;Patrick, John;17;No;NicholasCage.png
1991;135;Doors, The;Drama;Kilmer, Val;Ryan, Meg;Stone, Oliver;60;No;NicholasCage.png
1990;98;Welcome Home, Roxy Carmichael;Comedy;Daniels, Jeff;Ryder, Winona;Abrahams, Jim;41;No;NicholasCage.png
1972;99;Cancel My Reservation;Comedy;Hope, Bob;Saint, Eva Marie;Bogart, Paul;60;No;NicholasCage.png
1991;135;North by Northwest;Mystery;Grant, Cary;Saint, Eva Marie;Hitchcock, Alfred;20;No;alfredHitchcock.png
1966;127;Russians Are Coming, the Russians Are, The;Comedy;Reiner, Carl;Saint, Eva Marie;Jewison, Norman;79;Yes;NicholasCage.png
1992;213;Exodus;Drama;Newman, Paul;Saint, Eva Marie;Preminger, Otto;13;No;paulNewman.png
1982;128;Ballad of Narayama, The;Drama;Ogata, Ken;Sakamoto, Sumiko;Imamura, Shohei;88;No;NicholasCage.png
1985;96;Out of the Darkness;Mystery;Sheen, Martin;Salt, Jennifer;Taylor, Jud;86;No;NicholasCage.png
1971;90;Garden of the Finzi-Continis, The;Drama;Capolicchio, Lino;Sanda, Dominique;De Sica, Vittorio;42;Yes;NicholasCage.png
1974;105;Steppenwolf;Drama;Sydow, Max von;Sanda, Dominique;Haines, Fred;20;No;NicholasCage.png
1973;100;Mackintosh Man, The;Action;Newman, Paul;Sanda, Dominique;Huston, John;65;No;paulNewman.png
1968;105;Partner;Drama;Clementi, Pierre;Sandrelli, Stefania;Bertolucci, Bernardo;26;No;NicholasCage.png
1970;107;Conformist, The;Drama;Trintignant, Jean-Louis;Sandrelli, Stefania;Bertolucci, Bernardo;72;No;NicholasCage.png
1971;102;Dirty Harry;Drama;Eastwood, Clint;Santoni, Reni;Siegel, Don;72;No;clintEastwood.png
1986;103;Ferris Bueller's Day Off;Comedy;Broderick, Matthew;Sara, Mia;Hughes, John;12;No;NicholasCage.png
1986;89;Legend;Science Fiction;Cruise, Tom;Sara, Mia;Scott, Ridley;42;No;NicholasCage.png
1984;110;Buddy System, The;Drama;Dreyfuss, Richard;Sarandon, Susan;Jordan, Glenn;48;No;NicholasCage.png
1989;97;A Dry White Season;Drama;Sutherland, Donald;Sarandon, Susan;Palcy, Euzhan;71;No;NicholasCage.png
1975;105;Rocky Horror Picture Show, The;Music;Gray, Charles;Sarandon, Susan;Sharman, Jim;59;No;NicholasCage.png
1968;360;War & Peace;Drama;Tikhonov, Vyacheslav;Savelyeva, Lyudmila;Bondarchuk, Sergei;80;Yes;NicholasCage.png
1992;96;Defense of the Realm;Drama;Elliott, Denholm;Scacchi, Greta;;79;No;NicholasCage.png
1991;90;Basil The Rat;Comedy;Cleese, John;Scales, Prunella;;9;No;NicholasCage.png
1979;90;Fawlty Towers, Gourmet Night, Waldorf Salad & The Kipper & the Corpse;Comedy;Cleese, John;Scales, Prunella;;46;No;NicholasCage.png
1991;80;Going Under;Comedy;Pullman, Bill;Schaal, Wendy;Travis, Mark W.;30;No;NicholasCage.png
1990;83;U S. Sub Standard.;Comedy;Pullman, Bill;Schaal, Wendy;;27;No;NicholasCage.png
1990;;Hells Angels on Wheels;Action;Nicholson, Jack;Scharf, Sabrina;;1;No;NicholasCage.png
1975;118;Passenger, The;Drama;Nicholson, Jack;Schneider, Maria;Antonioni, Michelangelo;32;No;JackNicholson.png
1973;127;Last Tango in Paris;Drama;Brando, Marlon;Schneider, Maria;Bertolucci, Bernardo;28;No;brando.png
1987;155;Indigo Autumn & Lilac Dream;Drama;Singer, Marc;Schrage, Lisa;Gillard, Stuart;72;No;NicholasCage.png
1924;95;Kriemhild's Revenge, The Nibelungenlied;Drama;Loos, Theodor;Schön, Margarete;Lang, Fritz;74;No;NicholasCage.png
1966;102;Johnny Tiger;Drama;Taylor, Robert;Scott, Brenda;Wendkos, Paul;69;No;NicholasCage.png
1986;90;Head Office;Comedy;Reinhold, Judge;Seymour, Jane;Finkleman, Ken;88;No;NicholasCage.png
1990;;Live & Let Die;Action;Moore, Roger;Seymour, Jane;;62;No;NicholasCage.png
1972;100;Le Charme Discret de la Bourgeoisie;Comedy;Rey, Fernando;Seyrig, Delphine;Bunuel, Luis;4;Yes;NicholasCage.png
1986;83;Blue City;Action;Nelson, Judd;Sheedy, Ally;Manning, Michelle;38;No;NicholasCage.png
1983;123;Bad Boys;Drama;Penn, Sean;Sheedy, Ally;Rosenthal, Rick;7;No;NicholasCage.png
1986;82;Whoopee Boys, The;Comedy;O'Keefe, Michael;Shelley, Carole;Byrum, John;54;No;NicholasCage.png
1971;118;Last Picture Show, The;Drama;Bottoms, Timothy;Shepherd, Cybill;Bogdanovich, Peter;62;Yes;NicholasCage.png
1988;93;Diamond Trap, The;Drama;Hessman, Howard;Shields, Brooke;Taylor, Don;58;No;NicholasCage.png
1981;115;Endless Love;Drama;Hewitt, Martin;Shields, Brooke;Zeffirelli, Franco;20;No;NicholasCage.png
1976;90;Rocky;Drama;Stallone, Sylvester;Shire, Talia;Avildsen, John G.;78;Yes;NicholasCage.png
1988;103;Cocktail;Drama;Cruise, Tom;Shue, Elisabeth;Donaldson, Roger;13;No;NicholasCage.png
1936;77;Sabotage;Mystery;Homolka, Oskar;Sidney, Sylvia;Hitchcock, Alfred;74;No;alfredHitchcock.png
1977;105;Madame Rosa;Drama;Youb, Samy Ben;Signoret, Simone;Mizrahi, Moshe;11;Yes;NicholasCage.png
1985;56;Fozzie's Muppet Scrapbook;Comedy;Berle, Milton;Sills, Beverly;;86;No;NicholasCage.png
1954;110;Desiree;Drama;Brando, Marlon;Simmons, Jean;Koster, Henry;22;No;brando.png
1960;185;Spartacus;Drama;Douglas, Kirk;Simmons, Jean;Kubrick, Stanley;67;Yes;NicholasCage.png
1955;150;Guys & Dolls;Comedy;Brando, Marlon;Simmons, Jean;Mankiewicz, Joseph L.;70;Yes;brando.png
1992;95;Until They Sail;Drama;Newman, Paul;Simmons, Jean;Wise, Robert;77;No;paulNewman.png
1988;116;Coming to America;Comedy;Murphy, Eddie;Sinclair, Madge;Landis, John;11;No;NicholasCage.png
1963;93;Lilies of the Field;Drama;Poitier, Sidney;Skala, Lilia;Poitier, Sidney;36;Yes;NicholasCage.png
1987;99;River's Edge;Drama;Glover, Crispin;Skye, Ione;Hunter, Tim;3;No;NicholasCage.png
1986;93;Ruthless People;Comedy;DeVito, Danny;Slater, Helen;Abrahams, Jim;84;No;NicholasCage.png
1987;110;Secret of My Success, The;Comedy;Fox, Michael J.;Slater, Helen;Ross, Herbert;5;No;NicholasCage.png
1965;128;Shop on Main Street, The;Drama;Kroner, Josef;Slivoka, Hana;Kadar, Jan;37;Yes;NicholasCage.png
1988;101;Funny Farm;Comedy;Chase, Chevy;Smith, Madolyn;Hill, George Roy;30;No;NicholasCage.png
1988;120;Lonely Passion of Judith Hearne, The;Drama;Hoskins, Bob;Smith, Maggie;Clayton, Jack;24;No;NicholasCage.png
1978;103;California Suite;Comedy;Caine, Michael;Smith, Maggie;Ross, Herbert;11;Yes;NicholasCage.png
1986;97;Maximum Overdrive;Horror;Estevez, Emilio;Smith, Yeardley;King, Stephen;40;No;NicholasCage.png
1985;116;Pale Rider;Western;Eastwood, Clint;Snodgress, Carrie;Eastwood, Clint;45;No;clintEastwood.png
1990;88;Kissing Place, The;Drama;Birney, Meredith Baxter;Snow, Victoria;Wharmby, Tony;41;No;NicholasCage.png
1986;90;French Lesson;Comedy;Sterling, Alexandre;Snowden, Jane;Gilbert, Brian;29;No;NicholasCage.png
1985;88;Roller Blade;Action;Hutchinson, Jeff;Solari, Suzanne;Jackson, Donald G;31;No;NicholasCage.png
1964;101;A Shot in the Dark;Comedy;Sellers, Peter;Sommer, Elke;Edwards, Blake;51;No;NicholasCage.png
1979;88;Treasure Seekers, The;Action;Whitman, Stuart;Sommer, Elke;;2;No;NicholasCage.png
1982;122;Missing;Drama;Lemmon, Jack;Spacek, Sissy;Costa-Gavras;30;No;NicholasCage.png
1989;99;Picasso Trigger;Action;Bond, Steve;Speir, Dona;Sidaris, Andy;20;No;NicholasCage.png
1987;97;Hard Ticket to Hawaii;Action;Moss, Ronn;Speir, Dona;Sidaris, Andy;36;No;NicholasCage.png
1990;;Diamonds are Forever;Action;Connery, Sean;St. John, Jill;Hamilton, Guy;8;No;seanConnery.png
1933;72;Baby Face;Drama;Brent, George;Stanwyck, Barbara;Green, Alfred E.;66;No;NicholasCage.png
1992;95;Violent Men, The;Action;Ford, Glenn;Stanwyck, Barbara;Mate, Rudolph;25;No;glennFord.png
1985;117;Cocoon;Science Fiction;Ameche, Don;Stapleton, Maureen;Howard, Ron;45;Yes;NicholasCage.png
1986;96;Clockwise;Comedy;Cleese, John;Steadman, Alison;Morahan, Christopher;10;No;NicholasCage.png
1993;103;Romantic Comedy;Comedy;Moore, Dudley;Steenburgen, Mary;;8;No;NicholasCage.png
1981;111;Outland;Science Fiction;Connery, Sean;Sternhagen, Frances;Hyams, Peter;7;No;seanConnery.png
1967;114;Hang 'em High;Western;Eastwood, Clint;Stevens, Inger;Post, Ted;67;No;clintEastwood.png
1992;123;Basic Instinct;Mystery;Douglas, Michael;Stone, Sharon;Verhoeven, Paul;41;No;NicholasCage.png
1990;113;Total Recall;Action;Schwarzenegger, Arnold;Stone, Sharon;Verhoeven, Paul;8;No;NicholasCage.png
1987;115;Stakeout;Comedy;Dreyfuss, Richard;Stowe, Madeleine;Badham, John;13;No;NicholasCage.png
1992;104;Unnamable II, The Statement of Randolph Carter, The;Drama;Rhys-Davies, John;Strain, Julie;Ouellette, Jean-Paul;36;No;NicholasCage.png
1967;85;Trip, The;Drama;Fonda, Peter;Strasberg, Susan;Corman, Roger;64;No;NicholasCage.png
1987;135;Ironweed;Drama;Nicholson, Jack;Streep, Meryl;Babenco, Hector;32;No;merylStreep.png
1979;;Kramer vs. Kramer;Drama;Hoffman, Dustin;Streep, Meryl;Benton, Robert;8;Yes;merylStreep.png
1988;;Still of the Night;Mystery;Scheider, Roy;Streep, Meryl;Benton, Robert;42;No;merylStreep.png
1991;112;Defending Your Life;Comedy;Brooks, Albert;Streep, Meryl;Brooks, Albert;75;No;merylStreep.png
1978;183;Deer Hunter, The;Drama;De Niro, Robert;Streep, Meryl;Cimino, Michael;82;Yes;merylStreep.png
1984;106;Falling in Love;Drama;De Niro, Robert;Streep, Meryl;Grosbard, Ulu;31;No;merylStreep.png
1986;108;Heartburn;Comedy;Nicholson, Jack;Streep, Meryl;Nichols, Mike;57;No;JackNicholson.png
1983;131;Silkwood;Drama;Russell, Kurt;Streep, Meryl;Nichols, Mike;52;No;merylStreep.png
1982;151;Sophie's Choice;Drama;Kline, Kevin;Streep, Meryl;Pakula, Alan J.;64;Yes;merylStreep.png
1985;161;Out of Africa;Drama;Redford, Robert;Streep, Meryl;Pollack, Sydney;88;Yes;merylStreep.png
1981;127;French Lieutenant's Woman, The;Drama;Irons, Jeremy;Streep, Meryl;Reisz, Karel;37;No;merylStreep.png
1985;124;Plenty;Drama;Dance, Charles;Streep, Meryl;Schepisi, Fred;9;No;merylStreep.png
1988;122;A Cry in the Dark;Drama;Neill, Sam;Streep, Meryl;Schepisi, Fred;67;No;merylStreep.png
1989;99;She-Devil;Comedy;Begley, Ed, Jr.;Streep, Meryl;Seidelman, Susan;43;No;merylStreep.png
1992;103;Death Becomes Her;Drama;Willis, Bruce;Streep, Meryl;Zemeckis, Robert;61;No;merylStreep.png
1991;28;Kids & Pesticides;Drama;Whyatt, Robin;Streep, Meryl;;36;No;merylStreep.png
1970;129;On a Clear Day You Can See Forever;Music;Montand, Yves;Streisand, Barbra;Minnelli, Vincente;67;No;NicholasCage.png
1987;100;Nuts;Drama;Dreyfuss, Richard;Streisand, Barbra;Ritt, Martin;52;No;NicholasCage.png
1983;134;Yentl;Music;Patinkin, Mandy;Streisand, Barbra;Streisand, Barbra;46;No;NicholasCage.png
1968;151;Funny Girl;Music;Sharif, Omar;Streisand, Barbra;Wyler, William;30;Yes;NicholasCage.png
1990;97;Fellow Traveller;Drama;Travanti, Daniel J.;Stubbs, Imogen;Towns, Philip Saville;39;No;NicholasCage.png
1970;140;Dodesukaden;Drama;Zushi, Yoshitaka;Sugai, Kin;Kurosawa, Akira;75;No;NicholasCage.png
1987;;Sicilian, The;Drama;Lambert, Christopher;Sukowa, Barbara;Cimino, Michael;41;No;NicholasCage.png
1941;117;So Ends Our Night;Drama;March, Fredric;Sullavan, Margaret;Cromwell, John;2;No;NicholasCage.png
1984;102;Sword of the Valiant;Action;O'Keeffe, Miles;Sutton, Emma;Weeks, Stephen;5;No;NicholasCage.png
1949;78;Devil's Wanton, The;Drama;Malmsten, Birger;Svedlund, Doris;Bergman, Ingmar;66;No;Bergman.png
1989;99;Driving Miss Daisy;Drama;Freeman, Morgan;Tandy, Jessica;Beresford, Bruce;6;Yes;NicholasCage.png
1991;111;Seventh Cross, The;Drama;Tracy, Spencer;Tandy, Jessica;;35;No;spencerTracy.png
1983;105;Between Friends;Drama;Ramer, Henry;Taylor, Elizabeth;Antonio, Lou;54;No;elizabethTaylor.png
1957;173;Raintree County;Drama;Clift, Montgomery;Taylor, Elizabeth;Dmytryk, Edward;74;No;elizabethTaylor.png
1975;101;Driver's Seat, The;Drama;Bannen, Ian;Taylor, Elizabeth;Griffi, Giuseppe Patroni;72;No;elizabethTaylor.png
1967;109;Reflections in a Golden Eye;Drama;Brando, Marlon;Taylor, Elizabeth;Huston, John;81;No;elizabethTaylor.png
1972;110;X, Y & Zee;Drama;Caine, Michael;Taylor, Elizabeth;Hutton, Brian G.;87;No;elizabethTaylor.png
1968;109;Secret Ceremony;Drama;Mitchum, Robert;Taylor, Elizabeth;Losey, Joseph;60;No;elizabethTaylor.png
1963;243;Cleopatra;Drama;Burton, Richard;Taylor, Elizabeth;Mankiewicz, Joseph L.;80;No;elizabethTaylor.png
1950;;Father of the Bride;Comedy;Taylor, Rod;Taylor, Elizabeth;Minnelli, Vincente;54;No;elizabethTaylor.png
1992;130;Who's Afraid of Virginia Woolf?;Drama;Burton, Richard;Taylor, Elizabeth;Nichols, Mike;82;Yes;elizabethTaylor.png
1977;110;A Little Night Music;Music;Cariou, Len;Taylor, Elizabeth;Prince, Harold;61;No;elizabethTaylor.png
1956;201;Giant;Drama;Hudson, Rock;Taylor, Elizabeth;Stevens, George;61;Yes;elizabethTaylor.png
1985;94;Rumor Mill, The;Drama;Dysart, Richard A.;Taylor, Elizabeth;Trikonis, Gus;62;No;elizabethTaylor.png
1943;90;Lassie Come Home;Drama;McDowall, Roddy;Taylor, Elizabeth;Wilcox, Fred M;79;No;elizabethTaylor.png
1993;76;Return Engagement;Drama;Bottoms, Joseph;Taylor, Elizabeth;;26;No;elizabethTaylor.png
1972;108;Hammersmith Is Out;Drama;Burton, Richard;Taylor, Elizabeth;;80;No;elizabethTaylor.png
1991;60;Super Duper Bloopers;Comedy;Cooper, Gary;Taylor, Elizabeth;;21;No;elizabethTaylor.png
1991;;Elizabeth Taylor Collection, The;Drama;Fisher, Eddie;Taylor, Elizabeth;;21;No;elizabethTaylor.png
1973;99;Ash Wednesday;Drama;Fonda, Henry;Taylor, Elizabeth;;54;No;elizabethTaylor.png
1991;117;Last Time I Saw Paris, The;Drama;Johnson, Van;Taylor, Elizabeth;;13;No;elizabethTaylor.png
1931;125;Cimarron;Western;Dix, Richard;Taylor, Estelle;Ruggles, Wesley;44;Yes;NicholasCage.png
1992;83;Apache Woman;Western;Bridges, Lloyd;Taylor, Joan;Corman, Roger;32;No;NicholasCage.png
1984;;Gary Numan - Berzerker;Music;Webb, John;Taylor, Karen;;60;No;NicholasCage.png
1988;101;Mystic Pizza;Comedy;Moses, William;Taylor, Lili;Petrie, Donald;74;No;NicholasCage.png
1991;95;Dogfight;Action;Phoenix, River;Taylor, Lili;Savoca, Nancy;66;No;NicholasCage.png
1935;234;Adventures of Rex & Rinty, The;Western;Rex the Wonder Horse;Taylor, Norma;Beebe, Ford;87;No;NicholasCage.png
1988;60;Daphnis & Chloe;Music;Morrow, Carl;Taylor, Victoria;Wimhurst, Jolyon;85;No;NicholasCage.png
1980;97;Marathon;Comedy;Newhart, Bob;Taylor-Young, Leigh;Cooper, Jackie;76;No;NicholasCage.png
1948;127;Fort Apache;Western;Fonda, Henry;Temple, Shirley;Ford, John;4;No;johnFord.png
1937;100;Wee Willie Winkie;Drama;Romero, Cesar;Temple, Shirley;Ford, John;78;No;johnFord.png
1987;91;Big Shots;Action;Busker, Ricky;Thayer, Brynn;Mandel, Robert;5;No;NicholasCage.png
1988;85;Doin' Time on Planet Earth;Comedy;Strouse, Nicholas;Thompson, Andrea;Matthau, Charles;44;Yes;NicholasCage.png
1983;91;All the Right Moves;Drama;Cruise, Tom;Thompson, Lea;Chapman, Michael;65;No;NicholasCage.png
1987;93;Some Kind of Wonderful;Drama;Stoltz, Eric;Thompson, Lea;Deutch, Howard;16;No;NicholasCage.png
1990;87;All New Tales from the Crypt, A Trilogy;Horror;Walsh, M. Emmet;Thompson, Lea;Deutch, Howard;33;No;NicholasCage.png
1985;116;Back to the Future;Comedy;Fox, Michael J.;Thompson, Lea;Zemeckis, Robert;9;No;NicholasCage.png
1963;80;Winter Light;Drama;Björnstrand, Gunnar;Thulin, Ingrid;Bergman, Ingmar;2;No;Bergman.png
1963;95;Silence, The;Drama;Malmsten, Birger;Thulin, Ingrid;Bergman, Ingmar;79;No;Bergman.png
1959;100;Magician, The;Drama;Sydow, Max von;Thulin, Ingrid;Bergman, Ingmar;3;No;Bergman.png
1961;154;Four Horsemen of the Apocalypse, The;Drama;Ford, Glenn;Thulin, Ingrid;Minnelli, Vincente;71;No;glennFord.png
1986;99;Critical Condition;Comedy;Pryor, Richard;Ticotin, Rachel;Apted, Michael;41;No;NicholasCage.png
1989;88;Center of the Web;Mystery;Curtis, Tony;Tilton, Charlene;;42;No;NicholasCage.png
1990;110;Border Shootout;Action;Ford, Glenn;Tilton, Charlene;;7;No;glennFord.png
1989;109;Lean on Me;Drama;Freeman, Morgan;Todd, Beverly;Avildsen, John G.;51;No;NicholasCage.png
1986;221;On Wings of Eagles;Drama;Lancaster, Burt;Towers, Constance;McLaglen, Andrew V.;53;No;burtLancaster.png
1941;94;Texas;Western;Holden, William;Trevor, Claire;Marshall, George;79;No;NicholasCage.png
1939;80;Allegheny Uprising;Drama;Wayne, John;Trevor, Claire;Seiter, William A.;53;No;johnWayne.png
1940;95;Dark Command;Western;Wayne, John;Trevor, Claire;Walsh, Raoul;52;No;johnWayne.png
1986;103;Peggy Sue Got Married;Drama;Cage, Nicolas;Turner, Kathleen;Coppola, Francis Ford;62;No;NicholasCage.png
1989;84;Dear America, Letters Home from Vietnam;War;De Niro, Robert;Turner, Kathleen;Couturie, Bill;57;No;NicholasCage.png
1985;130;Prizzi's Honor;Comedy;Nicholson, Jack;Turner, Kathleen;Huston, John;25;Yes;JackNicholson.png
1983;90;Man with Two Brains, The;Comedy;Martin, Steve;Turner, Kathleen;Reiner, Carl;68;No;NicholasCage.png
1984;101;Crimes of Passion;Drama;Perkins, Anthony;Turner, Kathleen;Russell, Ken;4;No;NicholasCage.png
1985;106;Jewel of the Nile, The;Action;Douglas, Michael;Turner, Kathleen;Teague, Lewis;68;No;NicholasCage.png
1984;106;Romancing the Stone;Action;Douglas, Michael;Turner, Kathleen;Zemeckis, Robert;83;No;NicholasCage.png
1988;121;Accidental Tourist, The;Comedy;Hurt, William;Turner, Kathleen;;56;Yes;NicholasCage.png
1955;117;Sea Chase, The;War;Wayne, John;Turner, Lana;Farrow, John;4;No;johnWayne.png
1958;98;Another Time, Another Place;Drama;Connery, Sean;Turner, Lana;;4;No;seanConnery.png
1988;90;Cannibal Women in the Avocado Jungle of Death;Comedy;Primus, Barry;Tweed, Shannon;Lawton, J.F.;56;No;NicholasCage.png
1986;91;Mr Love.;Comedy;Jackson, Barry;Tyzack, Margaret;Battersby, Roy;10;No;NicholasCage.png
1968;139;2001: A Space Odyssey;Science Fiction;Dullea, Keir;Tyzack, Margaret;Kubrick, Stanley;83;No;NicholasCage.png
1966;81;Persona;Drama;Björnstrand, Gunnar;Ullman, Liv;Bergman, Ingmar;81;Yes;Bergman.png
1973;;Scenes from a Marriage;Drama;Josephson, Erland;Ullman, Liv;Bergman, Ingmar;3;Yes;Bergman.png
1968;88;Hour of the Wolf;Drama;Sydow, Max von;Ullman, Liv;Bergman, Ingmar;37;No;Bergman.png
1969;101;Passion of Anna, The;Drama;Sydow, Max von;Ullman, Liv;Bergman, Ingmar;6;No;Bergman.png
1984;96;Dangerous Moves;Drama;Caron, Leslie;Ullman, Liv;Dembo, Richard;7;Yes;NicholasCage.png
1957;147;Sayonara;Drama;Brando, Marlon;Umeki, Miyoshi;Logan, Joshua;19;Yes;brando.png
1968;158;Where Eagles Dare;War;Burton, Richard;Ure, Mary;Hutton, Brian G.;57;No;NicholasCage.png
1985;95;Teen Wolf;Drama;Fox, Michael J.;Ursitti, Susan;Daniel, Rod;58;No;NicholasCage.png
1990;88;Amazon;Action;Davi, Robert;Vaananen, Kari;Kaurismäki, Mika;30;No;NicholasCage.png
1973;;Paper Chase, The;Drama;Bottoms, Timothy;Wagner, Lindsay;Bridges, James;7;Yes;NicholasCage.png
1959;88;Virgin Spring, The;Drama;Sydow, Max von;Valberg, Brigitta;Bergman, Ingmar;8;Yes;Bergman.png
1970;97;Spider's Stratagem;Drama;Brogi, Giulio;Valli, Alida;Bertolucci, Bernardo;45;No;NicholasCage.png
1971;102;Play Misty for Me;Mystery;Eastwood, Clint;Walter, Jessica;Eastwood, Clint;47;No;clintEastwood.png
1981;88;Going Ape;Comedy;Danza, Tony;Walter, Jessica;Kronsberg, Jeremy Joe;65;No;NicholasCage.png
1967;127;Cool Hand Luke;Drama;Newman, Paul;Van Fleet, Jo;Rosenberg, Stuart;49;Yes;paulNewman.png
1988;89;Phantom of the Ritz;Horror;Bergman, Peter;Van Valkenburgh, Deborah;Plone, Allen;85;No;NicholasCage.png
1990;85;Crash & Burn;Science Fiction;Ganus, Paul;Ward, Megan;Band, Charles;75;No;NicholasCage.png
1991;114;After Dark My Sweet;Mystery;Patric, Jason;Ward, Rachel;Foley, James;33;No;NicholasCage.png
1992;121;Christopher Columbus: The Discovery;Adventure;Brando, Marlon;Ward, Rachel;Glen, John;39;No;NicholasCage.png
1986;109;Young Sherlock Holmes;Mystery;Rowe, Nicholas;Ward, Sophie;Levinson, Barry;16;No;NicholasCage.png
1991;104;Doc Hollywood;Comedy;Fox, Michael J.;Warner, Julie;Caton-Jones, Michael;64;No;NicholasCage.png
1988;96;Baja Oklahoma;Comedy;Coyote, Peter;Warren, Lesley Ann;Roth, Bobby;71;No;NicholasCage.png
1986;137;Aliens;Science Fiction;Biehn, Michael;Weaver, Sigourney;Cameron, James;82;No;weaver.png
1992;115;Alien Three;Science Fiction;Dutton, Charles;Weaver, Sigourney;Fincher, David;59;No;weaver.png
1997;109;Alien: Resurrection;Science Fiction;Perlman, Ron;Weaver, Sigourney;Jeunet, Jean-Pierre;60;No;weaver.png
1979;117;Alien;Science Fiction;Skerritt, Tom;Weaver, Sigourney;Scott, Ridley;83;No;weaver.png
1985;97;One Woman or Two;Comedy;Depardieu, Gérard;Weaver, Sigourney;Vigne, Daniel;64;No;weaver.png
1984;96;Soggy Bottom U. S. A.;Comedy;Johnson, Ben;Wedgeworth, Ann;Flicker, Theodore J.;50;No;NicholasCage.png
1973;96;Bang the Drum Slowly;Drama;Moriarty, Michael;Wedgeworth, Ann;Hancock, John D.;73;No;NicholasCage.png
1974;82;Catamount Killing, The;Action;Buchholz, Horst;Wedgeworth, Ann;Zanussi, Krzystoff;84;No;NicholasCage.png
1972;92;Fuzz;Action;Reynolds, Burt;Welch, Raquel;Colla, Richard A.;37;No;NicholasCage.png
1966;101;Shoot Loud, Louder, I Don't Understand!;Mystery;Mastroianni, Marcello;Welch, Raquel;De Filippo, Eduardo;70;No;NicholasCage.png
1967;107;Bedazzled;Comedy;Cook, Peter;Welch, Raquel;Donen, Stanley;67;No;NicholasCage.png
1977;120;Prince & the Pauper, The;Action;Reed, Oliver;Welch, Raquel;Fleischer, Richard;86;No;NicholasCage.png
1969;110;One Hundred Rifles;Western;Reynolds, Burt;Welch, Raquel;Gries, Tom;48;No;NicholasCage.png
1975;90;Wild Party, The;Drama;Dukes, David;Welch, Raquel;Ivory, James;75;No;NicholasCage.png
1968;106;Bandolero!;Western;Stewart, James;Welch, Raquel;McLaglen, Andrew V.;9;No;NicholasCage.png
1973;119;Last of Sheila, The;Mystery;Coburn, James;Welch, Raquel;Ross, Herbert;39;No;NicholasCage.png
1972;87;Hannie Caulder;Drama;Borgnine, Ernest;Welch, Raquel;;9;No;NicholasCage.png
1990;;Sounds of the Seventies...& the Beat Goes;Music;Jones, Tom;Welch, Raquel;;13;No;NicholasCage.png
1988;161;Bird;Drama;Whitaker, Forest;Venora, Diane;Eastwood, Clint;24;No;NicholasCage.png
1955;60;Meet Millie;Drama;Halop, Florence;Verdugo, Elena;;82;No;NicholasCage.png
1987;88;Hell Comes to Frogtown;Science Fiction;LeFlore, Julius;Verrell, Cec;Jackson, Donald G;74;No;NicholasCage.png
1966;126;Fortune Cookie, The;Comedy;Lemmon, Jack;West, Judi;Wilder, Billy;3;Yes;NicholasCage.png
1990;92;Sun Shines Bright, The;Action;Winninger, Charles;Whelan, Arleen;Ford, John;46;No;johnFord.png
1987;106;Squeeze, The;Action;Keach, Stacy;White, Carol;Apted, Michael;23;No;NicholasCage.png
1970;91;Start the Revolution Without Me;Comedy;Wilder, Gene;Whitelaw, Billie;Yorkin, Bud;62;No;NicholasCage.png
1989;107;Major League;Comedy;Sheen, Charlie;Whitton, Margaret;Ward, David S.;64;No;NicholasCage.png
1990;108;Bright Lights, Big City;Drama;Fox, Michael J.;Wiest, Dianne;Bridges, James;30;No;NicholasCage.png
1987;97;Lost Boys, The;Horror;Patric, Jason;Wiest, Dianne;Schumacher, Joel;67;No;NicholasCage.png
1989;93;Cookie;Comedy;Falk, Peter;Wiest, Dianne;Seidelman, Susan;43;No;NicholasCage.png
1974;114;Conversation, The;Drama;Hackman, Gene;Williams, Cindy;Coppola, Francis Ford;59;Yes;NicholasCage.png
1973;112;American Graffiti;Comedy;Dreyfuss, Richard;Williams, Cindy;Lucas, George;39;Yes;NicholasCage.png
1953;96;Dangerous When Wet;Music;Lamas, Fernando;Williams, Esther;Walters, Charles;67;No;NicholasCage.png
1980;111;Stir Crazy;Comedy;Pryor, Richard;Williams, JoBeth;Poitier, Sidney;40;No;NicholasCage.png
1989;91;Young Einstein;Comedy;Serious, Yahoo;Wilson, Pee-Wee;Serious, Yahoo;47;No;NicholasCage.png
1956;83;Killing, The;Drama;Hayden, Sterling;Windsor, Marie;Kubrick, Stanley;51;No;NicholasCage.png
1973;102;Cahill, United States Marshal;Western;Wayne, John;Windsor, Marie;McLaglen, Andrew V.;12;No;johnWayne.png
1989;90;Savage Intruder, The;Horror;Garfield, John David;Wing, Virginia;Wolfe, Donald;24;No;NicholasCage.png
1992;139;Sheltering Sky, The;Drama;Malkovich, John;Winger, Debra;Bertolucci, Bernardo;64;No;NicholasCage.png
1982;125;An Officer & a Gentleman;Drama;Gere, Richard;Winger, Debra;Hackford, Taylor;1;Yes;NicholasCage.png
1987;101;Black Widow;Mystery;Hopper, Dennis;Winger, Debra;Rafelson, Bob;54;No;NicholasCage.png
1986;116;Legal Eagles;Comedy;Redford, Robert;Winger, Debra;Reitman, Ivan;39;No;NicholasCage.png
1970;90;Bloody Mama;Action;Stroud, Don;Winters, Shelley;Corman, Roger;17;No;NicholasCage.png
1965;106;A Patch of Blue;Drama;Poitier, Sidney;Winters, Shelley;Green, Guy;51;No;NicholasCage.png
1955;109;I Died a Thousand Times;Drama;Palance, Jack;Winters, Shelley;Heisler, Stuart;23;No;NicholasCage.png
1977;90;Tentacles;Horror;Huston, John;Winters, Shelley;Hellman, Oliver;62;No;NicholasCage.png
1968;100;Scalphunters, The;Western;Lancaster, Burt;Winters, Shelley;Pollack, Sydney;33;No;burtLancaster.png
1992;96;A Day in October;Drama;Sweeney, D. B.;Wolf, Kelly;Madsen, Kenneth;76;No;NicholasCage.png
1964;102;A Fistful of Dollars;Western;Eastwood, Clint;Volonte, Gian Maria;Leone, Sergio;61;No;clintEastwood.png
1985;94;My Science Project;Comedy;Stockwell, John;Von Zerneck, Danielle;Betnel, Jonathan;84;No;NicholasCage.png
1991;160;Great Race, The;Comedy;Moore, Dudley;Wood, Natalie;Edwards, Blake;88;No;NicholasCage.png
1956;119;Searchers, The;Western;Wayne, John;Wood, Natalie;Ford, John;9;No;johnWayne.png
1979;105;Meteor;Action;Connery, Sean;Wood, Natalie;Neame, Ronald;5;No;seanConnery.png
1955;111;Rebel Without a Cause;Drama;Dean, James;Wood, Natalie;Ray, Nicholas;82;No;NicholasCage.png
1961;153;West Side Story;Music;Beymer, Richard;Wood, Natalie;Wise, Robert;38;Yes;NicholasCage.png
1970;110;Trash;Comedy;Dallesandro, Joe;Woodlawn, Holly;Morrissey, Paul;68;No;NicholasCage.png
1966;95;A Big Hand for the Little Lady;Comedy;Fonda, Henry;Woodward, Joanne;Cook, Fielder;12;No;NicholasCage.png
1966;104;A Fine Madness;Comedy;Connery, Sean;Woodward, Joanne;Kershner, Irvin;6;No;seanConnery.png
1987;134;Glass Menagerie, The;Drama;Malkovich, John;Woodward, Joanne;Newman, Paul;68;No;NicholasCage.png
1989;117;Harry & Son;Drama;Newman, Paul;Woodward, Joanne;Newman, Paul;57;No;paulNewman.png
1968;102;Rachel, Rachel;Drama;Olson, James;Woodward, Joanne;Newman, Paul;32;No;NicholasCage.png
1961;98;Paris Blues;Drama;Newman, Paul;Woodward, Joanne;Ritt, Martin;54;No;paulNewman.png
1960;135;Fugitive Kind, The;Drama;Brando, Marlon;Woodward, Joanne;;3;No;brando.png
1993;;Mr. & Mrs. Bridge;Drama;Newman, Paul;Woodward, Joanne;;29;No;paulNewman.png
1991;144;State of Grace;Drama;Penn, Sean;Wright, Robin;Joanou, Phil;49;No;NicholasCage.png
1943;108;Shadow of a Doubt;Drama;Cotten, Joseph;Wright, Teresa;Hitchcock, Alfred;32;No;alfredHitchcock.png
1950;85;Men, The;Drama;Brando, Marlon;Wright, Teresa;Zinnemann, Fred;27;No;brando.png
1950;110;Stage Fright;Mystery;Wilding, Michael;Wyman, Jane;Hitchcock, Alfred;72;No;alfredHitchcock.png
1947;103;Magic Town;Drama;Stewart, James;Wyman, Jane;Wellman, William;4;No;NicholasCage.png
1975;93;That Lucky Touch;Action;Moore, Roger;York, Susannah;Miles, Christopher;85;No;NicholasCage.png
1949;90;Lust for Gold;Drama;Ford, Glenn;Young, Gig;Simon, S. Sylvan;57;No;glennFord.png
1987;103;Heat;Mystery;Reynolds, Burt;Young, Karen;Jameson, Jerry;69;No;NicholasCage.png
1993;75;Employee's Entrance;Drama;William, Warren;Young, Loretta;;0;No;NicholasCage.png
1947;87;Night Is My Future;Drama;Malmsten, Birger;Zetterling, Mai;Bergman, Ingmar;17;No;Bergman.png
1990;92;Witches, The;Science Fiction;Fisher, Jasen;Zetterling, Mai;Roeg, Nicolas;18;No;NicholasCage.png
1953;94;Vera Cruz;Action;Cooper, Gary;;Aldrich, Robert;71;No;NicholasCage.png
1954;91;Apache;Western;Lancaster, Burt;;Aldrich, Robert;78;No;burtLancaster.png
1977;146;Twilight's Last Gleaming;Drama;Lancaster, Burt;;Aldrich, Robert;84;No;burtLancaster.png
1979;119;Frisco Kid, The;Comedy;Wilder, Gene;;Aldrich, Robert;10;No;NicholasCage.png
1954;30;Bank on the Stars;Drama;Paar, Jack;;Allen, Craig;;No;NicholasCage.png
1987;100;Law of Desire;Drama;Maura, Carmen;;Almodóvar, Pedro;73;No;NicholasCage.png
1966;103;Quiller Memorandum, The;Mystery;Segal, George;;Anderson, Michael;34;No;NicholasCage.png
1962;183;Longest Day, The;War;Wayne, John;;Annakin, Ken;7;No;johnWayne.png
1986;128;Name of the Rose, The;Drama;Connery, Sean;;Annaud, Jean-Jacques;8;No;seanConnery.png
1988;92;Bloodsport;Action;Van Damme, Jean-Claude;;Arnold, Newt;78;No;NicholasCage.png
1986;85;Torment;Horror;Gilbert, Taylor;;Aslanian, Samson;8;No;NicholasCage.png
1988;138;Pelle the Conqueror;Drama;Sydow, Max von;;August, Bille;14;Yes;NicholasCage.png
1981;118;Taps;Drama;Hutton, Timothy;;Becker, Harold;84;No;NicholasCage.png
1991;102;Freshman, The;Comedy;Brando, Marlon;;Bergman, Andrew;32;No;brando.png
1987;164;Last Emperor, The;Drama;Lone, John;;Bertolucci, Bernardo;1;Yes;NicholasCage.png
1962;100;Grim Reaper, The;Drama;Rulu, Francesco;;Bertolucci, Bernardo;35;No;NicholasCage.png
1983;90;Le Dernier Combat;Drama;Jolivet, Pierre;;Besson, Luc;72;No;NicholasCage.png
1989;91;Too Beautiful for You;Drama;Depardieu, Gérard;;Blier, Bertrand;35;No;NicholasCage.png
1991;105;Fire, Ice & Dynamite;Action;Moore, Roger;;Bogner, Willy;72;No;NicholasCage.png
1963;113;Heavens Above;Comedy;Sellers, Peter;;Boulting, John;38;No;NicholasCage.png
1961;141;One Eyed Jacks;Western;Malden, Karl;;Brando, Marlon;26;No;brando.png
1937;61;Swing It, Sailor!;Comedy;Ford, Wallace;;Cannon, Raymond;83;No;NicholasCage.png
1987;94;Wolf at the Door, The;Drama;Sutherland, Donald;;Carlsen, Henning;68;No;NicholasCage.png
1936;87;Modern Times;Comedy;Chaplin, Charles;;Chaplin, Charles;4;No;NicholasCage.png
1991;114;Thunderbolt & Lightfoot;Action;Eastwood, Clint;;Cimino, Michael;16;No;clintEastwood.png
1931;87;A Nous la Liberte;Drama;Marchand, Henri;;Clair, Rene;60;No;NicholasCage.png
1979;95;Scum;Action;Winstone, Ray;;Clarke, Alan;68;No;NicholasCage.png
1984;90;Inside Man, The;Action;Hopper, Dennis;;Clegg, Tom;45;No;NicholasCage.png
1979;153;Apocalypse Now;Drama;Brando, Marlon;;Coppola, Francis Ford;8;No;brando.png
1990;94;Bellboy & the Playgirls, The;Drama;Wilkinson, June;;Coppola, Francis Ford;7;No;NicholasCage.png
1963;81;Terror, The;Horror;Karloff, Boris;;Corman, Roger;88;No;NicholasCage.png
1963;86;Raven, The;Horror;Price, Vincent;;Corman, Roger;85;No;NicholasCage.png
1975;87;They Came from Within;Horror;Hampton, Paul;;Cronenberg, David;21;No;NicholasCage.png
1986;97;Boy in Blue, The;Drama;Cage, Nicolas;;Dale, Cynthia;63;No;NicholasCage.png
1991;87;Killer Tomatoes Strike Back;Comedy;Astin, John;;De Bello, John;24;No;NicholasCage.png
1979;87;Attack of the Killer Tomatoes;Comedy;Wilson, George;;De Bello, John;47;No;NicholasCage.png
1987;119;Untouchables, The;Drama;Connery, Sean;;De Palma, Brian;7;Yes;seanConnery.png
1986;91;Wise Guys;Comedy;Piscopo, Joe;;De Palma, Brian;16;No;NicholasCage.png
1989;90;American Autobahn;Drama;Jalenak, Jan;;Degas, Andre;75;No;NicholasCage.png
1990;94;Final Alliance, The;Action;Hasselhoff, David;;Di Leo, Mario;10;No;NicholasCage.png
1984;130;Bounty, The;Drama;Gibson, Mel;;Donaldson, Roger;25;No;NicholasCage.png
1974;89;Little Prince, The;Music;Kiley, Richard;;Donen, Stanley;31;No;NicholasCage.png
1975;94;Posse;Western;Douglas, Kirk;;Douglas, Kirk;76;No;NicholasCage.png
1982;136;Firefox;Action;Eastwood, Clint;;Eastwood, Clint;64;No;clintEastwood.png
1987;91;Penitentiary III;Action;Kennedy, Leon Isaac;;Fanaka, Jamaa;82;No;NicholasCage.png
1993;;Ginger & Fred;Comedy;Mastroianni, Marcello;;Fellini, Federico;29;No;NicholasCage.png
1966;107;Wrong Box, The;Comedy;Mills, John;;Forbes, Bryan;40;No;NicholasCage.png
1990;86;Wagonmaster;Western;Johnson, Ben;;Ford, John;1;No;johnFord.png
1945;135;They Were Expendable;War;Montgomery, Robert;;Ford, John;88;No;johnFord.png
1991;125;Last Hurrah, The;Drama;Tracy, Spencer;;Ford, John;46;No;spencerTracy.png
1949;59;Law of the Golden West;Western;Hale, Monte;;Ford, Philip;1;No;NicholasCage.png
1949;60;Pioneer Marshal;Western;Hale, Monte;;Ford, Philip;8;No;NicholasCage.png
1949;60;Ranger of the Cherokee Strip;Western;Hale, Monte;;Ford, Philip;31;No;NicholasCage.png
1950;60;Vanishing Westerner;Western;Hale, Monte;;Ford, Philip;6;No;NicholasCage.png
1948;59;Bandits of Dark Canyon;Western;Lane, Allan;;Ford, Philip;72;No;NicholasCage.png
1948;60;Bold Frontiersman, The;Western;Lane, Allan;;Ford, Philip;18;No;NicholasCage.png
1948;59;Wild Frontier, The;Western;Lane, Allan;;Ford, Philip;61;No;NicholasCage.png
1968;73;Firemen's Ball, The;Comedy;Vostrcil, Jan;;Forman, Milos;8;No;NicholasCage.png
1983;112;Local Hero;Comedy;Riegert, Peter;;Forsyth, Bill;54;No;NicholasCage.png
1971;104;French Connection, The;Drama;Hackman, Gene;;Friedkin, William;88;Yes;NicholasCage.png
1985;114;To Live & Die in L. A.;Action;Stockwell, Dean;;Friedkin, William;70;No;NicholasCage.png
1961;113;Ferry to Hong Kong;Drama;Welles, Orson;;Gilbert, Lewis;77;No;NicholasCage.png
1983;69;Eddie Murphy, Delirious;Comedy;Murphy, Eddie;;Gower, Bruce;6;No;NicholasCage.png
1984;77;Secret Policeman's Private Parts, The;Comedy;Cleese, John;;Graef, Roger;36;No;NicholasCage.png
1958;83;Up the Creek;Comedy;Sellers, Peter;;Guest, Val;54;No;NicholasCage.png
1982;111;Yol;Drama;Akan, Tarik;;Guney, Yilmaz;53;No;NicholasCage.png
1989;150;Sara Dane;Drama;Hopkins, Harold;;Hardy, Rod;75;No;NicholasCage.png
1988;84;Night Tide;Drama;Muir, Gavin;;Harrington, Curtis;50;No;NicholasCage.png
1953;92;His Majesty O'Keefe;Action;Lancaster, Burt;;Haskin, Byron;3;No;burtLancaster.png
1960;122;North to Alaska;Western;Wayne, John;;Hathaway, Henry;31;No;johnWayne.png
1966;76;Flight to Fury;Action;Nicholson, Jack;;Hellman, Monte;70;No;NicholasCage.png
1966;82;Ride in the Whirlwind;Western;Nicholson, Jack;;Hellman, Monte;26;No;NicholasCage.png
1970;93;Powderkeg;Western;Taylor, Rod;;Heyes, Douglas;26;No;NicholasCage.png
1953;95;I Confess;Drama;Clift, Montgomery;;Hitchcock, Alfred;63;No;alfredHitchcock.png
1935;88;Thirty-Nine Steps, The;Science Fiction;Donat, Robert;;Hitchcock, Alfred;8;No;alfredHitchcock.png
1969;126;Topaz;Mystery;Forsythe, John;;Hitchcock, Alfred;12;No;alfredHitchcock.png
1930;95;Murder;Mystery;Marshall, Herbert;;Hitchcock, Alfred;50;No;alfredHitchcock.png
1954;123;Dial M for Murder;Mystery;Milland, Ray;;Hitchcock, Alfred;52;No;alfredHitchcock.png
1937;80;Young & Innocent;Mystery;Pilbeam, Nova;;Hitchcock, Alfred;43;No;alfredHitchcock.png
1976;95;Creature from Black Lake;Horror;Elam, Jack;;Houck, Joy, Jr.;88;No;NicholasCage.png
1981;124;Chariots of Fire;Drama;Cross, Ben;;Hudson, Hugh;6;Yes;NicholasCage.png
1982;81;Monty Python Live at the Hollywood Bowl;Comedy;Chapman, Graham;;Hughes, Terry;81;No;NicholasCage.png
1975;129;Man Who Would Be King, The;Drama;Connery, Sean;;Huston, John;6;No;seanConnery.png
1981;117;Victory;Drama;Stallone, Sylvester;;Huston, John;39;No;NicholasCage.png
1970;146;Kelly's Heroes;War;Eastwood, Clint;;Hutton, Brian G.;84;No;clintEastwood.png
1989;109;Next of Kin;Mystery;Swayze, Patrick;;Irvin, John;63;No;NicholasCage.png
1990;96;Chattahoochee;Drama;Oldman, Gary;;Jackson, Mick;30;No;NicholasCage.png
1985;82;Angelic Conversation, The;Comedy;Reynolds, Paul;;Jarman, Derek;41;No;NicholasCage.png
1986;107;Down by Law;Comedy;Waits, Tom;;Jarmusch, Jim;49;No;NicholasCage.png
1984;141;Killing Fields, The;Drama;Waterston, Sam;;Joffe, Roland;6;Yes;NicholasCage.png
1992;85;Survival Zone;Action;Ford, Terence;;Jones, Chris;25;No;NicholasCage.png
1979;94;Monty Python's Life of Brian;Comedy;Chapman, Graham;;Jones, Terry;11;No;NicholasCage.png
1983;107;Monty Python's the Meaning of Life;Comedy;Cleese, John;;Jones, Terry;33;No;NicholasCage.png
1971;121;Red Tent, The;Action;Finch, Peter;;Kalatozov, Mikhail;7;No;NicholasCage.png
1945;82;Dakota;Western;Wayne, John;;Kane, Joseph;27;No;johnWayne.png
1952;112;Viva Zapata!;Drama;Brando, Marlon;;Kazan, Elia;86;Yes;brando.png
1968;133;Green Berets, The;War;Wayne, John;;Kellogg, Ray;36;No;johnWayne.png
1990;90;Big Bad John;Action;English, Doug;;Kennedy, Burt;84;No;NicholasCage.png
1937;71;Ticket of Leave Man, The;Mystery;Slaughter, Tod;;King, George;45;No;NicholasCage.png
1956;106;D-Day, The Sixth of June;War;Taylor, Robert;;Koster, Henry;84;No;NicholasCage.png
1974;121;Apprenticeship of Duddy Kravitz, The;Drama;Dreyfuss, Richard;;Kotcheff, Ted;64;Yes;NicholasCage.png
1971;138;A Clockwork Orange;Science Fiction;McDowell, Malcolm;;Kubrick, Stanley;83;Yes;NicholasCage.png
1991;117;Full Metal Jacket;War;Modine, Matthew;;Kubrick, Stanley;45;No;NicholasCage.png
1943;82;Sanshiro Sugata;Drama;Fujita, Susumu;;Kurosawa, Akira;85;No;NicholasCage.png
1991;97;Rhapsody in August;Drama;Gere, Richard;;Kurosawa, Akira;50;No;NicholasCage.png
1946;110;No Regrets for Our Youth;Drama;Hara, Setsuko;;Kurosawa, Akira;31;No;NicholasCage.png
1960;152;Bad Sleep Well, The;Drama;Mifune, Toshiro;;Kurosawa, Akira;65;No;NicholasCage.png
1951;166;Idiot, The;Drama;Mifune, Toshiro;;Kurosawa, Akira;40;No;NicholasCage.png
1951;83;Rashomon;Drama;Mifune, Toshiro;;Kurosawa, Akira;59;Yes;NicholasCage.png
1962;96;Sanjuro;Mystery;Mifune, Toshiro;;Kurosawa, Akira;6;No;NicholasCage.png
1955;200;Seven Samurai;Drama;Mifune, Toshiro;;Kurosawa, Akira;9;No;NicholasCage.png
1957;110;Throne of Blood;Drama;Mifune, Toshiro;;Kurosawa, Akira;60;No;NicholasCage.png
1961;110;Yojimbo;Action;Mifune, Toshiro;;Kurosawa, Akira;60;No;NicholasCage.png
1980;161;Kagemusha;Drama;Nakadai, Tatsuya;;Kurosawa, Akira;74;Yes;NicholasCage.png
1952;134;Ikiru;Drama;Shimura, Takashi;;Kurosawa, Akira;36;No;NicholasCage.png
1987;90;Empire of Spiritual Ninja;Action;Berlin, Tom;;Lambert, Bruce;26;No;NicholasCage.png
1986;90;Ninja, the Violent Sorcerer;Action;;;Lambert, Bruce;;No;NicholasCage.png
1926;139;Metropolis;Science Fiction;Abel, Alfred;;Lang, Fritz;49;No;NicholasCage.png
1946;106;Cloak & Dagger;Mystery;Cooper, Gary;;Lang, Fritz;55;No;NicholasCage.png
1920;137;Spiders;Drama;De Vogy, Carl;;Lang, Fritz;29;No;NicholasCage.png
1954;90;Human Desire;Drama;Ford, Glenn;;Lang, Fritz;27;No;glennFord.png
1928;130;Spies;Drama;Klein-Rogge, Rudolf;;Lang, Fritz;49;No;NicholasCage.png
1933;120;Testament of Dr. Mabuse, The;Drama;Klein-Rogge, Rudolf;;Lang, Fritz;4;No;NicholasCage.png
1991;95;Fury;Drama;Tracy, Spencer;;Lang, Fritz;48;No;spencerTracy.png
1990;129;Mo' Better Blues;Drama;Washington, Denzel;;Lee, Spike;78;No;NicholasCage.png
1989;30;Matt Talbot;Drama;Ford, Seamus;;Lennon, Biddy W.;35;No;NicholasCage.png
1989;55;Will Rogers, Look Back in Laughter;Comedy;Williams, Robin;;Leo, Malcolm;6;No;NicholasCage.png
1991;130;For a Few Dollars More;Westerns;Eastwood, Clint;;Leone, Sergio;34;No;clintEastwood.png
1944;139;Thirty Seconds over Tokyo;War;Tracy, Spencer;;LeRoy, Mervyn;45;No;spencerTracy.png
1982;93;Class of 1984;Drama;King, Perry;;Lester, Mark L.;23;No;NicholasCage.png
1974;109;Juggernaut;Action;Harris, Richard;;Lester, Richard;63;No;NicholasCage.png
1987;120;Good Morning, Vietnam;Comedy;Williams, Robin;;Levinson, Barry;37;No;NicholasCage.png
1945;94;Blood on the Sun;Drama;Cagney, James;;Lloyd, Frank;76;No;NicholasCage.png
1969;161;Paint Your Wagon;Music;Marvin, Lee;;Logan, Joshua;46;No;NicholasCage.png
1964;105;Ensign Pulver;Comedy;Walker, Robert, Jr.;;Logan, Joshua;16;No;NicholasCage.png
1976;92;Street People;Action;Moore, Roger;;Lucidi, Maurizio;25;No;NicholasCage.png
1984;83;Manhunt, The;Action;Borgnine, Ernest;;Ludman, Larry;34;No;NicholasCage.png
1987;85;Operation Nam;War;Wayne, John Ethan;;Ludman, Larry;37;No;NicholasCage.png
1944;100;Fighting Seabees, The;War;Wayne, John;;Ludwig, Edward;35;No;johnWayne.png
1988;75;Let It Rock;Drama;Hopper, Dennis;;Lynch, David;32;No;lynch.png
1978;90;Eraserhead;Horror;Nance, John;;Lynch, David;2;No;lynch.png
1955;87;Ladykillers, The;Comedy;Guinness, Alec;;Mackendrick, Alexander;28;No;NicholasCage.png
1957;97;Sweet Smell of Success;Drama;Lancaster, Burt;;Mackendrick, Alexander;12;No;burtLancaster.png
1971;88;And Now for Something Completely Different;Comedy;Cleese, John;;MacNaughton, Ian;44;No;NicholasCage.png
1984;92;Crackers;Action;Sutherland, Donald;;Malle, Louis;17;No;NicholasCage.png
1991;89;Green Glove;Drama;Ford, Glenn;;Mate, Rudolph;54;No;glennFord.png
1970;89;Menace on the Mountain;Action;Crowley, Pat;;McEveety, Vincent;69;No;NicholasCage.png
1940;90;In Old California;Western;Wayne, John;;McGann, William;27;No;johnWayne.png
1967;85;Thirty Is a Dangerous Age, Cynthia;Comedy;Moore, Dudley;;McGrath, Joseph;28;No;NicholasCage.png
1980;99;Ffolkes;Action;Moore, Roger;;McLaglen, Andrew V.;62;No;NicholasCage.png
1970;111;Chisum;Western;Wayne, John;;McLaglen, Andrew V.;72;No;johnWayne.png
1990;135;Hunt for Red October, The;Drama;Connery, Sean;;McTiernan, John;8;No;seanConnery.png
1966;123;Closely Watched Trains;Drama;Neckar, Vaclav;;Menzel, Jiri;75;Yes;NicholasCage.png
1973;91;Executive Action;Drama;Lancaster, Burt;;Miller, David;6;No;burtLancaster.png
1942;101;Flying Tigers;Action;Wayne, John;;Miller, David;61;No;johnWayne.png
1991;87;Father's Little Dividend;Comedy;Tracy, Spencer;;Minnelli, Vincente;52;No;spencerTracy.png
1982;92;An Evening with Robin Williams;Comedy;Williams, Robin;;Mischer, Don;68;No;NicholasCage.png
1987;90;Eddie Murphy Raw;Comedy;Murphy, Eddie;;Murphy, Eddie;51;No;NicholasCage.png
1989;118;Harlem Nights;Comedy;Murphy, Eddie;;Murphy, Eddie;11;No;NicholasCage.png
1973;93;Santee;Western;Ford, Glenn;;Nelson, Gary;47;No;glennFord.png
1987;90;Good Father, The;Drama;Hopkins, Anthony;;Newell, Mike;42;No;AnthonyHopkins.png
1971;115;Sometimes a Great Notion;Drama;Newman, Paul;;Newman, Paul;7;No;paulNewman.png
1970;117;Catch Twenty-Two;Comedy;Arkin, Alan;;Nichols, Mike;50;No;NicholasCage.png
1988;90;Dark Age;Action;Jarratt, John;;Nicholson, Arch;3;No;NicholasCage.png
1981;94;Deadline;Mystery;Newman, Barry;;Nicholson, Arch;9;No;paulNewman.png
1935;60;Mysterious Mr. Wong;Mystery;Lugosi, Bela;;Nigh, William;71;No;NicholasCage.png
1988;92;A Month in the Country;Drama;Firth, Colin;;O'Connor, Pat;57;No;NicholasCage.png
1990;97;Prom Night III, The Last Kiss;Horror;Conlon, Tim;;Oliver, Ron;29;No;NicholasCage.png
1990;;Blood in, Blood Out;Drama;Penn, Sean;;Olmos, Edward James;88;No;NicholasCage.png
1989;94;Wrong Arm of the Law, The;Comedy;Sellers, Peter;;Owen, Cliff;25;No;NicholasCage.png
1987;116;Orphans;Drama;Finney, Albert;;Pakula, Alan J.;21;No;NicholasCage.png
1976;139;All the President's Men;Drama;Redford, Robert;;Pakula, Alan J.;45;Yes;NicholasCage.png
1987;73;J-Men Forever;Action;Bergman, Peter;;Patterson, Richard;59;No;NicholasCage.png
1969;144;Wild Bunch, The;Western;Holden, William;;Peckinpah, Sam;50;No;NicholasCage.png
1988;92;Judgement in Berlin;Drama;Sheen, Martin;;Penn, Leo;13;No;NicholasCage.png
1993;;Hot Line, The;Comedy;Boyer, Charles;;Perier, Etienne;70;No;NicholasCage.png
1988;100;Rocket Gibraltar;Drama;Lancaster, Burt;;Petrie, Daniel;26;No;burtLancaster.png
1975;112;Yakuza, The;Action;Mitchum, Robert;;Pollack, Sydney;16;No;NicholasCage.png
1972;116;Jeremiah Johnson;Drama;Redford, Robert;;Pollack, Sydney;88;No;NicholasCage.png
1970;112;Burn!;Drama;Brando, Marlon;;Pontecorvo, Gillo;75;No;brando.png
1973;122;Magnum Force;Action;Eastwood, Clint;;Post, Ted;28;No;clintEastwood.png
1989;86;Cyborg;Action;Van Damme, Jean-Claude;;Pyun, Albert;31;No;NicholasCage.png
1979;108;Prisoner of Zenda, The;Comedy;Sellers, Peter;;Quine, Richard;12;No;NicholasCage.png
1983;86;Scream;Horror;Martin, Pepper;;Quisenberry, Byron;24;No;NicholasCage.png
1986;140;Assault, The;Drama;Lint, Derek De;;Rademakers, Fons;71;Yes;NicholasCage.png
1951;102;Flying Leathernecks;Action;Wayne, John;;Ray, Nicholas;23;No;johnWayne.png
1985;92;What Comes Around;Drama;Reed, Jerry;;Reed, Jerry;49;No;NicholasCage.png
1980;123;Mon Oncle D'Amerique;Comedy;Roger-Pierre;;Resnais, Alain;71;No;NicholasCage.png
1972;92;Culpepper Cattle Company, The;Western;Grimes, Gary;;Richards, Dick;29;No;NicholasCage.png
1983;102;Survivors, The;Comedy;Matthau, Walter;;Ritchie, Michael;52;No;NicholasCage.png
1984;96;Roadhouse Sixty-Six;Action;Dafoe, Willem;;Robinson, John Mark;20;No;NicholasCage.png
1991;60;Burning Poles, Cecil Taylor in Performance;Music;Taylor, Cecil;;Rochlin, Sheldon;82;No;NicholasCage.png
1987;98;Russkies;Action;Hubley, Whip;;Rosenthal, Rick;87;No;NicholasCage.png
1990;96;My Blue Heaven;Comedy;Martin, Steve;;Ross, Herbert;63;No;NicholasCage.png
1990;103;Altered States;Science Fiction;Hurt, William;;Russell, Ken;22;No;NicholasCage.png
1972;128;Cowboys, The;Western;Wayne, John;;Rydell, Mark;58;No;johnWayne.png
1985;95;Code Name, Emerald;Drama;Harris, Ed;;Sanger, Jonathan;22;No;NicholasCage.png
1970;170;Patton;War;Scott, George C.;;Schaffner, Franklin J.;8;Yes;NicholasCage.png
1969;123;Midnight Cowboy;Drama;Hoffman, Dustin;;Schlesinger, John;33;Yes;NicholasCage.png
1985;131;Falcon & the Snowman, The;Drama;Hutton, Timothy;;Schlesinger, John;61;No;NicholasCage.png
1976;112;Maitresse;Drama;Ogier, Bulle;;Schroeder, Barbet;39;No;NicholasCage.png
1987;86;Disorderlies;Comedy;Boys, The Fat;;Schultz, Michael;69;No;NicholasCage.png
1991;;Raging Bull;Drama;De Niro, Robert;;Scorsese, Martin;25;No;NicholasCage.png
1991;60;Garrison Keillor's Home;Comedy;Keillor, Garrison;;Sevush, Herb;6;No;NicholasCage.png
1938;55;Overland Stage Raiders;Western;Wayne, John;;Sherman, George;83;No;johnWayne.png
1938;55;Pals of the Saddle;Western;Wayne, John;;Sherman, George;33;No;johnWayne.png
1982;92;Alone in the Dark;Horror;Schultz, Dwight;;Sholder, Jack;75;No;NicholasCage.png
1971;109;Beguiled, The;Drama;Eastwood, Clint;;Siegel, Don;60;No;clintEastwood.png
1979;112;Escape from Alcatraz;Drama;Eastwood, Clint;;Siegel, Don;22;No;clintEastwood.png
1948;88;Criss Cross;Drama;Lancaster, Burt;;Siodmak, Robert;77;No;burtLancaster.png
1976;132;Midway;War;Heston, Charlton;;Smight, Jack;36;No;NicholasCage.png
1990;126;Indiana Jones & the Last Crusade;Action;Ford, Harrison;;Spielberg, Steven;8;No;NicholasCage.png
1993;90;Duel;Mystery;Weaver, Dennis;;Spielberg, Steven;48;No;NicholasCage.png
1991;193;Separate but Equal;Drama;Poitier, Sidney;;Stevens, George, Jr.;56;No;NicholasCage.png
1924;123;Gosta Berling's Saga;Drama;Hanson, Lars;;Stiller, Mauritz;63;No;NicholasCage.png
1986;120;Platoon;Drama;Sheen, Charlie;;Stone, Oliver;8;Yes;NicholasCage.png
1963;89;Crawling Hand, The;Science Fiction;Breck, Peter;;Strock, Herbert L.;79;No;NicholasCage.png
1971;100;Willy Wonka & the Chocolate Factory;Music;Wilder, Gene;;Stuart, Mel;65;No;NicholasCage.png
1971;88;Joe Kidd;Western;Eastwood, Clint;;Sturges, John;79;No;clintEastwood.png
1985;104;Santa Claus, The Movie;Comedy;Moore, Dudley;;Szwarc, Jeannot;19;No;NicholasCage.png
1938;96;Boys Town;Drama;Tracy, Spencer;;Taurog, Norman;21;Yes;spencerTracy.png
1990;59;Erasure, Live Wild!;Music;;;Taylor, Gavin;48;No;NicholasCage.png
1982;150;A Question of Honor;Drama;Gazzara, Ben;;Taylor, Jud;80;No;NicholasCage.png
1947;61;Check Your Guns;Western;Dean, Eddie;;Taylor, Ray;80;No;NicholasCage.png
1947;56;West to Glory;Western;Dean, Eddie;;Taylor, Ray;43;No;NicholasCage.png
1937;60;Throwback, The;Western;Jones, Buck;;Taylor, Ray;53;No;NicholasCage.png
1992;54;Border Feud;Action;LaRue, Lash;;Taylor, Ray;43;No;NicholasCage.png
1947;58;Fighting Vigilantes, The;Western;LaRue, Lash;;Taylor, Ray;21;No;NicholasCage.png
1947;53;Law of the Lash;Western;LaRue, Lash;;Taylor, Ray;66;No;NicholasCage.png
1949;66;Outlaw Country;Western;LaRue, Lash;;Taylor, Ray;62;No;NicholasCage.png
1992;53;Return of the Lash;Action;LaRue, Lash;;Taylor, Ray;78;No;NicholasCage.png
1937;60;Mystery of the Hooded Horsemen;Western;Ritter, Tex;;Taylor, Ray;52;No;NicholasCage.png
1937;60;Tex Rides with the Boy Scouts;Western;Ritter, Tex;;Taylor, Ray;17;No;NicholasCage.png
1949;59;Shadows of the West;Western;Wilson, Whip;;Taylor, Ray;40;No;NicholasCage.png
1991;102;Instant Karma;Comedy;Cassidy, David;;Taylor, Roderick;47;No;NicholasCage.png
1957;73;Time Lock;Drama;Connery, Sean;;Thomas, Gerald;5;No;seanConnery.png
1953;79;Appointment in Honduras;Drama;Ford, Glenn;;Tourneur, Jacques;7;No;glennFord.png
1982;136;Danton;Drama;Depardieu, Gérard;;Wajda, Andrzej;5;No;NicholasCage.png
1960;164;Alamo, The;Action;Wayne, John;;Wayne, John;29;No;johnWayne.png
1986;91;La Chevre, (The Goat);Drama;Depardieu, Gérard;;Veber, Francis;24;No;NicholasCage.png
1985;109;Les Comperes;Comedy;Richard, Pierre;;Veber, Francis;54;No;NicholasCage.png
1990;128;Dead Poets Society;Drama;Williams, Robin;;Weir, Peter;8;Yes;NicholasCage.png
1952;93;Othello, The Lost Masterpiece;Drama;Welles, Orson;;Welles, Orson;23;No;NicholasCage.png
1949;119;Battleground, The;War;Johnson, Van;;Wellman, William;7;No;NicholasCage.png
1976;176;Kings of the Road (In the Course of Time);Drama;Vogler, Rudiger;;Wenders, Wim;41;No;NicholasCage.png
1990;98;Hiroshima;Drama;Nelson, Judd;;Werner, Peter;17;No;NicholasCage.png
1982;111;Return of Martin Guerre, The;Drama;Depardieu, Gérard;;Vigne, Daniel;51;No;NicholasCage.png
1956;97;Somebody up There Likes Me;Drama;Newman, Paul;;Wise, Robert;56;No;paulNewman.png
1955;57;Jack Benny Show;Comedy;Benny, Jack;;;51;No;NicholasCage.png
1962;182;Mutiny on the Bounty;Action;Brando, Marlon;;;35;No;brando.png
1989;;Death Valley Days, Deadly Decision;Western;Caan, James;;;9;No;NicholasCage.png
1986;60;Monty Python's Flying Circus;Comedy;Chapman, Graham;;;4;No;NicholasCage.png
1986;60;Monty Python's Flying Circus, Vol 1.;Comedy;Chapman, Graham;;;24;No;NicholasCage.png
1986;59;Monty Python's Flying Circus, Vol 2.;Comedy;Chapman, Graham;;;79;No;NicholasCage.png
1986;58;Monty Python's Flying Circus, Vol 3.;Comedy;Chapman, Graham;;;63;No;NicholasCage.png
1990;;Valkenvania;Comedy;Chase, Chevy;;;82;No;NicholasCage.png
1982;101;Secret Policeman's Other Ball, The;Comedy;Cleese, John;;;86;No;NicholasCage.png
1981;127;Taming of the Shrew, The;Drama;Cleese, John;;;2;No;NicholasCage.png
1964;;From Russia with Love;Action;Connery, Sean;;;6;No;seanConnery.png
1993;108;Offence, The;Mystery;Connery, Sean;;;6;No;seanConnery.png
1992;60;Hollywood Mavericks;Comedy;Coppola, Francis Ford;;;22;No;NicholasCage.png
1990;60;Live at Harrah's;Comedy;Cosby, Bill;;;6;No;NicholasCage.png
1992;52;Persuaders, The Overture, The;Mystery;Curtis, Tony;;;40;No;NicholasCage.png
1977;255;Nineteen Hundred;Drama;De Niro, Robert;;;82;No;NicholasCage.png
1989;90;Van, The;Comedy;DeVito, Danny;;;5;No;NicholasCage.png
1972;15;My Country Right or Wrong;War;Douglas, Michael;;;21;No;NicholasCage.png
1991;;Clint Eastwood Collection, The;Westerns;Eastwood, Clint;;;11;No;clintEastwood.png
1991;;Complete Dirty Harry, Magnum Force, The;Action;Eastwood, Clint;;;53;No;clintEastwood.png
1992;92;Dead Pool, The;Action;Eastwood, Clint;;;26;No;clintEastwood.png
1992;163;Good, the Bad & the Ugly, The;Westerns;Eastwood, Clint;;;68;No;clintEastwood.png
1959;60;Rawhide, Premiere Episode;Western;Eastwood, Clint;;;54;No;clintEastwood.png
1992;118;Tightrope;Mystery;Eastwood, Clint;;;55;No;clintEastwood.png
1987;95;Hearts of Fire;Drama;Everett, Rupert;;;25;No;NicholasCage.png
1992;165;How the West Was Won;Western;Fonda, Henry;;;45;No;NicholasCage.png
1992;;Mummy's Hand, The;Mystery;Foran, Dick;;;54;No;NicholasCage.png
1993;88;Great White Death;Action;Ford, Glenn;;;26;No;glennFord.png
1986;119;Mosquito Coast, The;Drama;Ford, Harrison;;;54;No;NicholasCage.png
1993;102;Today We Kill....Tomorrow We Die;Western;Ford, Montgomery;;;25;No;NicholasCage.png
1991;;Tormenta Sobre Arizona;Drama;Ford, Wallace;;;81;No;NicholasCage.png
1989;116;Back to the Future II;Comedy;Fox, Michael J.;;;65;No;NicholasCage.png
1959;60;Maverick, Duel at Sundown;Western;Garner, James;;;26;No;NicholasCage.png
1983;;Shakespeare Series;Drama;Gielgud, John;;;23;No;NicholasCage.png
1973;105;Deadly Trackers;Western;Harris, Richard;;;54;No;NicholasCage.png
1992;72;American Film Institute, Alfred Hitchcock;Mystery;Hitchcock, Alfred;;;70;No;NicholasCage.png
1990;;A Married Man;Drama;Hopkins, Anthony;;;79;No;AnthonyHopkins.png
1982;208;Othello;Drama;Hopkins, Anthony;;;84;No;AnthonyHopkins.png
1975;85;Only Way Home, The;Drama;Hopkins, Bo;;;60;No;NicholasCage.png
1953;120;Tales of Tomorrow;Horror;Karloff, Boris;;;0;No;NicholasCage.png
1991;128;Inherit the Wind;Drama;Kelly, Gene;;;18;No;NicholasCage.png
1990;45;This Is Horror;Horror;King, Stephen;;;3;No;NicholasCage.png
1992;112;Conversation Piece;Drama;Lancaster, Burt;;;1;No;burtLancaster.png
1992;105;Crimson Pirate, The;Action;Lancaster, Burt;;;60;No;burtLancaster.png
1992;83;Devil's Disciple, The;Mystery;Lancaster, Burt;;;65;No;burtLancaster.png
1992;166;Hallelujah Trail, The;Drama;Lancaster, Burt;;;6;No;burtLancaster.png
1992;133;Train, The;Action;Lancaster, Burt;;;68;No;burtLancaster.png
1986;49;Jay Leno: The American Dream;Comedy;Leno, Jay;;;67;No;NicholasCage.png
1990;92;Primal Rage;Mystery;Lowe, Patrick;;;3;No;NicholasCage.png
1990;50;Industrial Symphony, The Dream of the Broken-Hearted;Music;Lynch, David;;;49;No;lynch.png
1986;52;Howie Mandel's North American Watusi Tour;Comedy;Mandel, Howie;;;65;No;NicholasCage.png
1989;90;Branford Marsalis, Steep;Music;Marsalis, Branford;;;52;No;NicholasCage.png
1991;98;L. A. Story;Comedy;Martin, Steve;;;81;No;NicholasCage.png
1986;60;Steve Martin Live!;Comedy;Martin, Steve;;;3;No;NicholasCage.png
1974;60;Steve Martin, The Funnier Side of Eastern Canada;Comedy;Martin, Steve;;;34;No;NicholasCage.png
1993;;Runaway Barge, The;Action;Matheson, Tim;;;38;No;NicholasCage.png
1992;101;Romulus & the Sabines;Action;Moore, Roger;;;76;No;NicholasCage.png
1989;;Saint, The;Mystery;Moore, Roger;;;29;No;NicholasCage.png
1983;91;Strange Brew;Comedy;Moranis, Rick;;;24;No;NicholasCage.png
1990;98;Another Forty-Eight Hours;Action;Murphy, Eddie;;;54;No;NicholasCage.png
1989;;Best of Eddie Murphy, Saturday Night Live, The;Comedy;Murphy, Eddie;;;56;No;NicholasCage.png
1991;99;What about Bob?;Comedy;Murray, Bill;;;6;No;NicholasCage.png
1953;91;Mummy's Revenge, The;Horror;Naschy, Paul;;;56;No;NicholasCage.png
1992;121;Harper;Mystery;Newman, Paul;;;86;No;paulNewman.png
1992;102;Left Handed Gun, The;Western;Newman, Paul;;;26;No;paulNewman.png
1989;;Once upon a Wheel;Action;Newman, Paul;;;40;No;paulNewman.png
1992;136;Prize, The;Drama;Newman, Paul;;;66;No;paulNewman.png
1968;;Secret War of Harry Frigg, The;Comedy;Newman, Paul;;;28;No;paulNewman.png
1990;;Two Jakes, The;Mystery;Nicholson, Jack;;;3;No;NicholasCage.png
1989;61;Exile in Concert;Music;Pennington, J. P.;;;12;No;NicholasCage.png
1987;60;Joe Piscopo New Jersey Special;Comedy;Piscopo, Joe;;;14;No;NicholasCage.png
1991;60;Joe Piscopo Video, The;Comedy;Piscopo, Joe;;;44;No;NicholasCage.png
1989;;Death Valley Days, No Gun Behind His Badge;Western;Reagan, Ronald;;;1;No;NicholasCage.png
1988;96;Salsa: The Motion Picture;Drama;Rosa, Robby;;;26;No;NicholasCage.png
1991;80;Hollywood's Greatest War Movies;War;Scott, George C.;;;41;No;NicholasCage.png
1991;91;Out for Justice;Action;Seagal, Steven;;;2;No;NicholasCage.png
1956;27;Case of the Mukkinese Battle Horn, The;Comedy;Sellers, Peter;;;45;No;NicholasCage.png
1953;75;Goon Show Movie, The;Comedy;Sellers, Peter;;;80;No;NicholasCage.png
1975;95;Great McGonagall, The;Comedy;Sellers, Peter;;;72;No;NicholasCage.png
1991;101;I'm All Right Jack;Comedy;Sellers, Peter;;;23;No;NicholasCage.png
1991;101;Magic Christian, The;Comedy;Sellers, Peter;;;75;No;NicholasCage.png
1960;91;Never Let Go;Action;Sellers, Peter;;;5;No;NicholasCage.png
1991;121;Pink Panther, The;Comedy;Sellers, Peter;;;77;No;NicholasCage.png
1991;84;Two-Way Stretch;Comedy;Sellers, Peter;;;7;No;NicholasCage.png
1988;65;Face at the Window, The;Horror;Slaughter, Tod;;;79;No;NicholasCage.png
1958;92;Tom Thumb;Science Fiction;Tamblyn, Russ;;;30;No;NicholasCage.png
1989;90;Beartooth;Action;Taylor, Dub;;;70;No;NicholasCage.png
1979;90;James Taylor in Concert;Music;Taylor, James;;;38;No;NicholasCage.png
1942;253;Gangbusters;Drama;Taylor, Kent;;;31;No;NicholasCage.png
1992;;El Rublo de las Dos Caras;Action;Taylor, Robert;;;83;No;NicholasCage.png
1992;87;Law & Jake Wade, The;Drama;Taylor, Robert;;;68;No;NicholasCage.png
1967;105;Chuka;Western;Taylor, Rod;;;47;No;NicholasCage.png
1980;93;Cry of the Innocent;Drama;Taylor, Rod;;;13;No;NicholasCage.png
1991;108;Edison the Man;Drama;Tracy, Spencer;;;19;No;spencerTracy.png
1991;101;Keeper of the Flame;Drama;Tracy, Spencer;;;76;No;spencerTracy.png
1991;92;Spencer Tracy Legacy, The;Comedy;Tracy, Spencer;;;44;No;spencerTracy.png
1957;60;Cheyenne, The Iron Trail;Western;Walker, Clint;;;1;No;NicholasCage.png
1992;56;Dawn Rider, The;Western;Wayne, John;;;44;No;johnWayne.png
1993;;Duke, The Films of John Wayne;Western;Wayne, John;;;70;No;johnWayne.png
1939;55;Frontier Horizon;Western;Wayne, John;;;73;No;johnWayne.png
1934;54;Hell Town;Western;Wayne, John;;;23;No;johnWayne.png
1932;;Hurricane Express;Western;Wayne, John;;;7;No;johnWayne.png
1932;210;Hurricane Express, The;Action;Wayne, John;;;68;No;johnWayne.png
1965;165;In Harm's Way;War;Wayne, John;;;66;No;johnWayne.png
1991;;John Wayne Collection, Red River, The;War;Wayne, John;;;49;No;johnWayne.png
1992;;John Wayne Collector's Limited Edition;War;Wayne, John;;;3;No;johnWayne.png
1991;;John Wayne Four Pack;Western;Wayne, John;;;58;No;johnWayne.png
1939;112;John Wayne Matinee Double Feature, No. 2;Western;Wayne, John;;;3;No;johnWayne.png
1939;110;John Wayne Matinee Double Feature, No. 3;Western;Wayne, John;;;24;No;johnWayne.png
1938;110;John Wayne Matinee Double Feature, No. 4;Western;Wayne, John;;;28;No;johnWayne.png
1990;;John Wayne Six Pack;Western;Wayne, John;;;87;No;johnWayne.png
1991;;John Wayne Western Greats, Rio Bravo;Western;Wayne, John;;;22;No;johnWayne.png
1991;56;King of the Pecos;Western;Wayne, John;;;78;No;johnWayne.png
1992;59;Lawless Frontier;Western;Wayne, John;;;8;No;johnWayne.png
1991;52;Lawless Frontier, The;Western;Wayne, John;;;35;No;johnWayne.png
1991;56;Lawless Nineties, The;Western;Wayne, John;;;3;No;johnWayne.png
1934;54;Lucky Texan;Western;Wayne, John;;;48;No;johnWayne.png
1992;112;McQ;Action;Wayne, John;;;5;No;johnWayne.png
1993;;Neath Arizona Skies;Western;Wayne, John;;;73;No;johnWayne.png
1991;54;Neath the Arizona Skies;Western;Wayne, John;;;28;No;johnWayne.png
1991;53;Randy Rides Alone;Western;Wayne, John;;;75;No;johnWayne.png
1993;58;Range Feud;Western;Wayne, John;;;77;No;johnWayne.png
1992;134;Red River;Western;Wayne, John;;;16;No;johnWayne.png
1991;52;Riders of Destiny;Western;Wayne, John;;;30;No;johnWayne.png
1990;;Sagebrush Trail;Western;Wayne, John;;;23;No;johnWayne.png
1932;226;Shadow of the Eagle, The;Action;Wayne, John;;;19;No;johnWayne.png
1989;103;Blood & Guns;Action;Welles, Orson;;;43;No;NicholasCage.png
1988;78;Hot Money;Drama;Welles, Orson;;;19;No;NicholasCage.png
1977;75;Comedy Tonight;Comedy;Williams, Robin;;;18;No;NicholasCage.png
1991;65;Robin Williams;Comedy;Williams, Robin;;;4;No;NicholasCage.png
================================================
FILE: FUNDING.yml
================================================
custom: https://learndataengineering.com/p/academy
================================================
FILE: LICENSE
================================================
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
================================================
FILE: README.md
================================================
What is this Book?
How to Contribute
YouTube
Twitter
Amazon Shop
## If You Like This Book & Need More Help
Check out my Data Engineering Academy at LearnDataEngineering.com trusted by almost 2,000 students!
**Visit learndataengineering.com:** [Click Here](https://learndataengineering.com)
- Learn Data Engineering with our online Academy
- Perfect for becoming a Data Engineer or adding Data Engineering to your skillset
- Proven process based on years of experience and hundreds of hours of personal coaching
- Over 30 prepared courses on the most important techniques, fundamental tools and platforms plus our
- Associate Data Engineer Certification
- Academy Discord server with over 1,000 members
## Support This Book For Free!
- **Amazon:** [Click Here](https://www.amazon.com/shop/plumbersofdatascience) buy whatever you like from Amazon using this link* (Also check out my complete podcast gear and books)
## Here's what's new:
Find the change log with all recent updates here: [SEE UPDATES](sections/10-Updates.md)
# Contents:
- [Introduction](sections/01-Introduction.md)
- [Basic Engineering Skills](sections/02-BasicSkills.md)
- [Advanced Engineering Skills](sections/03-AdvancedSkills.md)
- [Free Hands On Courses / Tutorials](sections/04-HandsOnCourse.md)
- [Case Studies](sections/05-CaseStudies.md)
- [Best Practices Cloud Platforms](sections/06-BestPracticesCloud.md)
- [130+ Data Sources Data Science](sections/07-DataSources.md)
- [1001 Interview Questions](sections/08-InterviewQuestions.md)
- [Recommended Books, Courses, and Podcasts](sections/09-BooksAndCourses.md)
- [Updates](sections/10-Updates.md)
- [How To Contribute](#how-to-contribute)
- [Support What You Like](#support)
- [Important Links](#important-links)
# Full Table Of Contents:
## Introduction
- [What is this Cookbook](sections/01-Introduction.md#what-is-this-cookbook)
- [Data Engineers](sections/01-Introduction.md#data-engineers)
- [My Data Science Platform Blueprint](sections/01-Introduction.md#my-data-science-platform-blueprint)
- [Connect](sections/01-Introduction.md#connect)
- [Buffer](sections/01-Introduction.md#buffer)
- [Processing Framework](sections/01-Introduction.md#processing-framework)
- [Store](sections/01-Introduction.md#store)
- [Visualize](sections/01-Introduction.md#visualize)
- [Who Companies Need](sections/01-Introduction.md#who-companies-need)
- [How to Learn Data Engineering](sections/01-Introduction.md#how-to-learn-data-engineering)
- [Andreas on the Super Data Science Podcast](sections/01-Introduction.md#Interview-with-Andreas-on-the-Super-Data-Science-Podcast)
- [Building Blocks to Learn Data Engineering](sections/01-Introduction.md#building-blocks-to-learn-data-engineering)
- [Roadmap for Beginners](sections/01-Introduction.md#roadmap-for-beginners)
- [Roadmap for Data Analysts](sections/01-Introduction.md#roadmap-for-data-analysts)
- [Roadmap for Data Scientists](sections/01-Introduction.md#roadmap-for-data-scientists)
- [Roadmap for Software Engineers](sections/01-Introduction.md#roadmap-for-software-engineers)
- [Data Engineers Skills Matrix](sections/01-Introduction.md#data-engineers-skills-matrix)
- [How to Become a Senior Data Engineer](sections/01-Introduction.md#how-to-become-a-senior-data-engineer)
## Basic Engineering Skills
- [Learn To Code](sections/02-BasicSkills.md#learn-to-code)
- [Get Familiar With Git](sections/02-BasicSkills.md#get-familiar-with-git)
- [Agile Development](sections/02-BasicSkills.md#agile-development)
- [Why is agile so important?](sections/02-BasicSkills.md#Why-is-agile-so-important)
- [Agile rules I learned over the years](sections/02-BasicSkills.md#agile-rules-i-learned-over-the-years)
- [Agile Frameworks](sections/02-BasicSkills.md#agile-frameworks)
- [Scrum](sections/02-BasicSkills.md#scrum)
- [OKR](sections/02-BasicSkills.md#okr)
- [Software Engineering Culture](sections/02-BasicSkills.md#software-engineering-culture)
- [Learn how a Computer Works](sections/02-BasicSkills.md#learn-how-a-computer-works)
- [Data Network Transmission](sections/02-BasicSkills.md#data-network-transmission)
- [Security and Privacy](sections/02-BasicSkills.md#security-and-privacy)
- [SSL Public and Private Key Certificates](sections/02-BasicSkills.md#ssl-public-and-private-key-Certificates)
- [JSON Web Tokens](sections/02-BasicSkills.md#json-web-tokens)
- [GDPR regulations](sections/02-BasicSkills.md#gdpr-regulations)
- [Linux](sections/02-BasicSkills.md#linux)
- [OS Basics](sections/02-BasicSkills.md#os-basics)
- [Shell scripting](sections/02-BasicSkills.md#shell-scripting)
- [Cron Jobs](sections/02-BasicSkills.md#cron-jobs)
- [Packet Management](sections/02-BasicSkills.md#packet-management)
- [Docker](sections/02-BasicSkills.md#docker)
- [What is Docker and How it Works](sections/02-BasicSkills.md#what-is-docker-and-what-do-you-use-it-for)
- [Don't Mess Up Your System](sections/02-BasicSkills.md#dont-mess-up-your-system)
- [Preconfigured Images](sections/02-BasicSkills.md#preconfigured-images)
- [Take it With You](sections/02-BasicSkills.md#take-it-with-you)
- [Kubernetes Container Deployment](sections/02-BasicSkills.md#kubernetes-container-deployment)
- [How to Create Start and Stop a Container](sections/02-BasicSkills.md#how-to-create-start-stop-a-container)
- [Docker Micro Services](sections/02-BasicSkills.md#docker-micro-services)
- [Kubernetes](sections/02-BasicSkills.md#kubernetes)
- [Why and How To Do Docker Container Orchestration](sections/02-BasicSkills.md#why-and-how-to-do-docker-container-orchestration)
- [Useful Docker Commands](sections/02-BasicSkills.md#useful-docker-commands)
- [The Cloud](sections/02-BasicSkills.md#the-cloud)
- [IaaS vs PaaS vs SaaS](sections/02-BasicSkills.md#iaas-vs-paas-vs-saas)
- [AWS Azure IBM Google](sections/02-BasicSkills.md#aws-azure-ibm-google)
- [Cloud vs On-Premises](sections/02-BasicSkills.md#cloud-vs-on-premises)
- [Security](sections/02-BasicSkills.md#security)
- [Hybrid Clouds](sections/02-BasicSkills.md#hybrid-clouds)
- [Security Zone Design](sections/02-BasicSkills.md#security-zone-design)
- [How to secure a multi layered application](sections/02-BasicSkills.md#how-to-secure-a-multi-layered-application)
- [Cluster security with Kerberos](sections/02-BasicSkills.md#cluster-security-with-kerberos)
## Advanced Engineering Skills
- [Data Science Platform](sections/03-AdvancedSkills.md#data-science-platform)
- [Why a Good Data Platform Is Important](sections/03-AdvancedSkills.md#why-a-good-data-platform-is-important)
- [Big Data vs Data Science and Analytics](sections/03-AdvancedSkills.md#Big-Data-vs-Data-Science-and-Analytics)
- [The 4 Vs of Big Data](sections/03-AdvancedSkills.md#the-4-vs-of-big-data)
- [Why Big Data](sections/03-AdvancedSkills.md#why-big-data)
- [Planning is Everything](sections/03-AdvancedSkills.md#planning-is-everything)
- [The Problem with ETL](sections/03-AdvancedSkills.md#the-problem-with-etl)
- [Scaling Up](sections/03-AdvancedSkills.md#scaling-up)
- [Scaling Out](sections/03-AdvancedSkills.md#scaling-out)
- [When not to Do Big Data](sections/03-AdvancedSkills.md#please-dont-go-big-data)
- [81 Platform & Pipeline Design Questions](sections/03-AdvancedSkills.md#81-platform-and-pipeline-design-questions)
- [Data Source Questions](sections/03-AdvancedSkills.md#data-source-questions)
- [Goals and Destination Questions](sections/03-AdvancedSkills.md#goals-and-destination-questions)
- [Connect](sections/03-AdvancedSkills.md#connect)
- [REST APIs](sections/03-AdvancedSkills.md#rest-apis)
- [API Design](sections/03-AdvancedSkills.md#api-design)
- [Implementation Frameworks](sections/03-AdvancedSkills.md#implementation-frameworks)
- [Security](sections/03-AdvancedSkills.md#security)
- [Apache Nifi](sections/03-AdvancedSkills.md#apache-nifi)
- [Logstash](sections/03-AdvancedSkills.md#logstash)
- [Buffer](sections/03-AdvancedSkills.md#buffer)
- [Apache Kafka](sections/03-AdvancedSkills.md#apache-kafka)
- [Why a Message Queue Tool?](sections/03-AdvancedSkills.md#why-a-message-queue-tool)
- [Kafka Architecture](sections/03-AdvancedSkills.md#kafka-architecture)
- [Kafka Topics](sections/03-AdvancedSkills.md#what-are-topics)
- [Kafka and Zookeeper](sections/03-AdvancedSkills.md#what-does-zookeeper-have-to-do-with-kafka)
- [How to Produce and Consume Messages](sections/03-AdvancedSkills.md#how-to-produce-and-consume-messages)
- [Kafka Commands](sections/03-AdvancedSkills.md#kafka-commands)
- [Apache Redis Pub-Sub](sections/03-AdvancedSkills.md#redis-pub-sub)
- [AWS Kinesis](sections/03-AdvancedSkills.md#aws-kinesis)
- [Google Cloud PubSub](sections/03-AdvancedSkills.md#google-cloud-pubsub)
- [Processing Frameworks](sections/03-AdvancedSkills.md#processing-frameworks)
- [Lambda and Kappa Architecture](sections/03-AdvancedSkills.md#lambda-and-kappa-architecture)
- [Batch Processing](sections/03-AdvancedSkills.md#batch-processing)
- [Stream Processing](sections/03-AdvancedSkills.md#stream-processing)
- [Three Methods of Streaming](sections/03-AdvancedSkills.md#three-methods-of-streaming)
- [At Least Once](sections/03-AdvancedSkills.md#at-least-once)
- [At Most Once](sections/03-AdvancedSkills.md#at-most-once)
- [Exactly Once](sections/03-AdvancedSkills.md#exactly-once)
- [Check The Tools](sections/03-AdvancedSkills.md#check-the-tools)
- [Should You do Stream or Batch Processing](sections/03-AdvancedSkills.md#should-you-do-stream-or-batch-processing)
- [Is ETL still relevant for Analytics?](sections/03-AdvancedSkills.md#is-etl-still-relevant-for-analytics)
- [MapReduce](sections/03-AdvancedSkills.md#mapreduce)
- [How Does MapReduce Work](sections/03-AdvancedSkills.md#How-does-mapreduce-work)
- [MapReduce](sections/03-AdvancedSkills.md#mapreduce)
- [MapReduce Example](sections/03-AdvancedSkills.md#example)
- [MapReduce Limitations](sections/03-AdvancedSkills.md#What-is-the-limitation-of-mapreduce)
- [Apache Spark](sections/03-AdvancedSkills.md#apache-spark)
- [What is the Difference to MapReduce?](sections/03-AdvancedSkills.md#what-is-the-difference-to-MapReduce)
- [How Spark Fits to Hadoop](sections/03-AdvancedSkills.md#how-does-spark-fit-to-hadoop)
- [Spark vs Hadoop](sections/03-AdvancedSkills.md#wheres-the-difference)
- [Spark and Hadoop a Perfect Fit](sections/03-AdvancedSkills.md#spark-and-hadoop-is-a-perfect-fit)
- [Spark on YARN](sections/03-AdvancedSkills.md#spark-on-yarn)
- [My Simple Rule of Thumb](sections/03-AdvancedSkills.md#my-simple-rule-of-thumb)
- [Available Languages](sections/03-AdvancedSkills.md#available-languages)
- [Spark Driver Executor and SparkContext](sections/03-AdvancedSkills.md#how-spark-works-driver-executor-sparkcontext)
- [Spark Batch vs Stream processing](sections/03-AdvancedSkills.md#spark-batch-vs-stream-processing)
- [How Spark uses Data From Hadoop](sections/03-AdvancedSkills.md#How-does-spark-use-data-from-hadoop)
- [What are RDDs and How to Use Them](sections/03-AdvancedSkills.md#what-are-rdds-and-how-to-use-them)
- [SparkSQL How and Why to Use It](sections/03-AdvancedSkills.md#available-languages)
- [What are Dataframes and How to Use Them](sections/03-AdvancedSkills.md#what-are-dataframes-how-to-use-them)
- [Machine Learning on Spark (TensorFlow)](sections/03-AdvancedSkills.md#machine-learning-on-spark-tensor-flow)
- [MLlib](sections/03-AdvancedSkills.md#mllib)
- [Spark Setup](sections/03-AdvancedSkills.md#spark-setup)
- [Spark Resource Management](sections/03-AdvancedSkills.md#spark-resource-management)
- [AWS Lambda](sections/03-AdvancedSkills.md#aws-lambda)
- [Apache Flink](sections/03-AdvancedSkills.md#apache-flink)
- [Elasticsearch](sections/03-AdvancedSkills.md#elasticsearch)
- [Apache Drill](sections/03-AdvancedSkills.md#apache-drill)
- [StreamSets](sections/03-AdvancedSkills.md#streamsets)
- [Store](sections/03-AdvancedSkills.md#store)
- [Analytical Data Stores](sections/03-AdvancedSkills.md#analytical-data-stores)
- [Data Warehouse vs Data Lake](sections/03-AdvancedSkills.md#data-warehouse-vs-data-lake)
- [Snowflake and dbt](sections/03-AdvancedSkills.md#snowflake-and-dbt)
- [Transactional Data Stores](sections/03-AdvancedSkills.md#transactional-data-stores)
- [SQL Databases](sections/03-AdvancedSkills.md#sql-databases)
- [PostgreSQL DB](sections/03-AdvancedSkills.md#postgresql-db)
- [Database Design](sections/03-AdvancedSkills.md#database-design)
- [SQL Queries](sections/03-AdvancedSkills.md#sql-queries)
- [Stored Procedures](sections/03-AdvancedSkills.md#stored-procedures)
- [ODBC/JDBC Server Connections](sections/03-AdvancedSkills.md#odbc-jdbc-server-connections)
- [NoSQL Stores](sections/03-AdvancedSkills.md#nosql-stores)
- [HBase KeyValue Store](sections/03-AdvancedSkills.md#keyvalue-stores-hbase)
- [HDFS Document Store](sections/03-AdvancedSkills.md#document-stores-hdfs)
- [MongoDB Document Store](sections/03-AdvancedSkills.md#document-stores-mongodb)
- [Elasticsearch Document Store](sections/03-AdvancedSkills.md#Elasticsearch-search-engine-and-document-store)
- [Hive Warehouse](sections/03-AdvancedSkills.md#hive-warehouse)
- [Impala](sections/03-AdvancedSkills.md#impala)
- [Kudu](sections/03-AdvancedSkills.md#kudu)
- [Apache Druid](sections/03-AdvancedSkills.md#apache-druid)
- [InfluxDB Time Series Database](sections/03-AdvancedSkills.md#influxdb-time-series-database)
- [Greenplum MPP Database](sections/03-AdvancedSkills.md#mpp-databases-greenplum)
- [Visualize](sections/03-AdvancedSkills.md#visualize)
- [Android and IOS](sections/03-AdvancedSkills.md#android-and-ios)
- [API Design for Mobile Apps](sections/03-AdvancedSkills.md#how-to-design-apis-for-mobile-apps)
- [Dashboards](sections/03-AdvancedSkills.md#dashboards)
- [Grafana](sections/03-AdvancedSkills.md#grafana)
- [Kibana](sections/03-AdvancedSkills.md#kibana)
- [Webservers](sections/03-AdvancedSkills.md#how-to-use-webservers-to-display-content)
- [Tomcat](sections/03-AdvancedSkills.md#tomcat)
- [Jetty](sections/03-AdvancedSkills.md#jetty)
- [NodeRED](sections/03-AdvancedSkills.md#nodered)
- [React](sections/03-AdvancedSkills.md#react)
- [Business Intelligence Tools](sections/03-AdvancedSkills.md#business-intelligence-tools)
- [Tableau](sections/03-AdvancedSkills.md#tableau)
- [Power BI](sections/03-AdvancedSkills.md#power-bi)
- [Qlik Sense](sections/03-AdvancedSkills.md#quliksense)
- [Identity & Device Management](sections/03-AdvancedSkills.md#Identity-and-device-management)
- [What Is A Digital Twin](sections/03-AdvancedSkills.md#what-is-a-digital-twin)
- [Active Directory](sections/03-AdvancedSkills.md#active-directory)
- [Machine Learning](sections/03-AdvancedSkills.md#machine-learning)
- [How to do Machine Learning in production](sections/03-AdvancedSkills.md#how-to-domachine-learning-in-production)
- [Why machine learning in production is harder then you think](sections/03-AdvancedSkills.md#why-machine-learning-in-production-is-harder-then-you-think)
- [Models Do Not Work Forever](sections/03-AdvancedSkills.md#models-do-not-work-forever)
- [Where are The Platforms That Support Machine Learning](sections/03-AdvancedSkills.md#where-are-the-platforms-that-support-this)
- [Training Parameter Management](sections/03-AdvancedSkills.md#training-parameter-management)
- [How to Convince People That Machine Learning Works](sections/03-AdvancedSkills.md#how-to-convince-people-machine-learning-works)
- [No Rules No Physical Models](sections/03-AdvancedSkills.md#no-rules-no-physical-models)
- [You Have The Data. Use It!](sections/03-AdvancedSkills.md#you-have-the-data-use-it)
- [Data is Stronger Than Opinions](sections/03-AdvancedSkills.md#data-is-stronger-than-opinions)
- [AWS Sagemaker](sections/03-AdvancedSkills.md#aws-sagemaker)
## Hands On Course
- [Free Data Engineering Course with AWS, TDengine, Docker and Grafana](sections/04-HandsOnCourse.md#free-data-engineering-course-with-aws-tdengine-docker-and-grafana)
- [Monitor your data in dbt & detect quality issues with Elementary](sections/04-HandsOnCourse.md#monitor-your-data-in-dbt-and-detect-quality-issues-with-elementary)
- [Solving Engineers 4 Biggest Airflow Problems](sections/04-HandsOnCourse.md#solving-engineers-4-biggest-airflow-problems)
- [The best alternative to Airflow? Mage.ai](sections/04-HandsOnCourse.md#the-best-alternative-to-airlfow?-mage.ai)
## Case Studies
- [Data Science @Airbnb](sections/05-CaseStudies.md#data-science-at-Airbnb)
- [Data Science @Amazon](sections/05-CaseStudies.md#data-science-at-Amazon)
- [Data Science @Baidu](sections/05-CaseStudies.md#data-science-at-Baidu)
- [Data Science @Blackrock](sections/05-CaseStudies.md#data-science-at-Blackrock)
- [Data Science @BMW](sections/05-CaseStudies.md#data-science-at-BMW)
- [Data Science @Booking.com](sections/05-CaseStudies.md#data-science-at-Booking.com)
- [Data Science @CERN](sections/05-CaseStudies.md#data-science-at-CERN)
- [Data Science @Disney](sections/05-CaseStudies.md#data-science-at-Disney)
- [Data Science @DLR](sections/05-CaseStudies.md#data-science-at-DLR)
- [Data Science @Drivetribe](sections/05-CaseStudies.md#data-science-at-Drivetribe)
- [Data Science @Dropbox](sections/05-CaseStudies.md#data-science-at-Dropbox)
- [Data Science @Ebay](sections/05-CaseStudies.md#data-science-at-Ebay)
- [Data Science @Expedia](sections/05-CaseStudies.md#data-science-at-Expedia)
- [Data Science @Facebook](sections/05-CaseStudies.md#data-science-at-Facebook)
- [Data Science @Google](sections/05-CaseStudies.md#data-science-at-Google)
- [Data Science @Grammarly](sections/05-CaseStudies.md#data-science-at-Grammarly)
- [Data Science @ING Fraud](sections/05-CaseStudies.md#data-science-at-ING-Fraud)
- [Data Science @Instagram](sections/05-CaseStudies.md#data-science-at-Instagram)
- [Data Science @LinkedIn](sections/05-CaseStudies.md#data-science-at-LinkedIn)
- [Data Science @Lyft](sections/05-CaseStudies.md#data-science-at-Lyft)
- [Data Science @NASA](sections/05-CaseStudies.md#data-science-at-NASA)
- [Data Science @Netflix](sections/05-CaseStudies.md#data-science-at-Netflix)
- [Data Science @OLX](sections/05-CaseStudies.md#data-science-at-OLX)
- [Data Science @OTTO](sections/05-CaseStudies.md#data-science-at-OTTO)
- [Data Science @Paypal](sections/05-CaseStudies.md#data-science-at-Paypal)
- [Data Science @Pinterest](sections/05-CaseStudies.md#data-science-at-Pinterest)
- [Data Science @Salesforce](sections/05-CaseStudies.md#data-science-at-Salesforce)
- [Data Science @Siemens Mindsphere](sections/05-CaseStudies.md#data-science-at-Siemens-Mindsphere)
- [Data Science @Slack](sections/05-CaseStudies.md#data-science-at-Slack)
- [Data Science @Spotify](sections/05-CaseStudies.md#data-science-at-Spotify)
- [Data Science @Symantec](sections/05-CaseStudies.md#data-science-at-Symantec)
- [Data Science @Tinder](sections/05-CaseStudies.md#data-science-at-Tinder)
- [Data Science @Twitter](sections/05-CaseStudies.md#data-science-at-Twitter)
- [Data Science @Uber](sections/05-CaseStudies.md#data-science-at-Uber)
- [Data Science @Upwork](sections/05-CaseStudies.md#data-science-at-Upwork)
- [Data Science @Woot](sections/05-CaseStudies.md#data-science-at-Woot)
- [Data Science @Zalando](sections/05-CaseStudies.md#data-science-at-Zalando)
## Best Practices Cloud Platforms
- [Amazon Web Services (AWS)](sections/06-BestPracticesCloud.md#aws)
- [Connect](sections/06-BestPracticesCloud.md#Connect)
- [Buffer](sections/06-BestPracticesCloud.md#Buffer)
- [Processing](sections/06-BestPracticesCloud.md#Processing)
- [Store](sections/06-BestPracticesCloud.md#Store)
- [Visualize](sections/06-BestPracticesCloud.md#Visualize)
- [Containerization](sections/06-BestPracticesCloud.md#Containerization)
- [Best Practices](sections/06-BestPracticesCloud.md#Best-Practices)
- [More Details](sections/06-BestPracticesCloud.md#More-Details)
- [Microsoft Azure](sections/06-BestPracticesCloud.md#azure)
- [Connect](sections/06-BestPracticesCloud.md#Connect-1)
- [Buffer](sections/06-BestPracticesCloud.md#Buffer-1)
- [Processing](sections/06-BestPracticesCloud.md#Processing-1)
- [Store](sections/06-BestPracticesCloud.md#Store-1)
- [Visualize](sections/06-BestPracticesCloud.md#Visualize-1)
- [Containerization](sections/06-BestPracticesCloud.md#Containerization-1)
- [Best Practices](sections/06-BestPracticesCloud.md#Best-Practices-1)
- [Google Cloud Platform (GCP)](sections/06-BestPracticesCloud.md#gcp)
- [Connect](sections/06-BestPracticesCloud.md#Connect-2)
- [Buffer](sections/06-BestPracticesCloud.md#Buffer-2)
- [Processing](sections/06-BestPracticesCloud.md#Processing-2)
- [Store](sections/06-BestPracticesCloud.md#Store-2)
- [Visualize](sections/06-BestPracticesCloud.md#Visualize-2)
- [Containerization](sections/06-BestPracticesCloud.md#Containerization-2)
- [Best Practices](sections/06-BestPracticesCloud.md#Best-Practices-2)
## 130+ Free Data Sources For Data Science
- [Student Favorites](sections/07-DataSources.md#Student-Favorites)
- [General And Academic](sections/07-DataSources.md#General-And-Academic)
- [Content Marketing](sections/07-DataSources.md#Content-Marketing)
- [Crime](sections/07-DataSources.md#Crime)
- [Drugs](sections/07-DataSources.md#Drugs)
- [Education](sections/07-DataSources.md#Education)
- [Entertainment](sections/07-DataSources.md#Entertainment)
- [Environmental And Weather Data](sections/07-DataSources.md#Environmental-And-Weather-Data)
- [Financial And Economic Data](sections/07-DataSources.md#Financial-And-Economic-Data)
- [Government And World](sections/07-DataSources.md#Government-And-World)
- [Health](sections/07-DataSources.md#Health)
- [Human Rights](sections/07-DataSources.md#Human-Rights)
- [Labor And Employment Data](sections/07-DataSources.md#Labor-And-Employment-Data)
- [Politics](sections/07-DataSources.md#Politics)
- [Retail](sections/07-DataSources.md#Retail)
- [Social](sections/07-DataSources.md#Social)
- [Travel And Transportation](sections/07-DataSources.md#Travel-And-Transportation)
- [Various Portals](sections/07-DataSources.md#Various-Portals)
- [Source Articles and Blog Posts](sections/07-DataSources.md#Source-Articles-and-Blog-Posts)
- [Free Data Sources Data Science](sections/07-DataSources.md)
## 1001 Interview Questions
- [Interview Questions](sections/08-InterviewQuestions.md)
## Recommended Books, Courses, and Podcasts
- [About Books and Courses](sections/09-BooksAndCourses.md#about-books-and-courses)
- [Books](sections/09-BooksAndCourses.md#books)
- [Languages](sections/09-BooksAndCourses.md#books-languages)
- [Data Tools & Platforms](sections/09-BooksAndCourses.md#books-data-science-tools)
- [Business](sections/09-BooksAndCourses.md#Books-Business)
- [Community Recommendations](sections/09-BooksAndCourses.md#Community-Recommendations)
- [Online Courses](sections/09-BooksAndCourses.md#online-courses)
- [Preparation courses](sections/09-BooksAndCourses.md#Preparation-courses)
- [Data engineering courses](sections/09-BooksAndCourses.md#Data-engineering-courses)
- [Certifications](sections/09-BooksAndCourses.md#Certifications)
- [Podcasts](sections/09-BooksAndCourses.md#Podcasts)
- [Super Data Science](sections/09-BooksAndCourses.md#Super-Data-Science)
- [Data Skeptic](sections/09-BooksAndCourses.md#Data-Skeptic)
- [Data Engineering Podcast](sections/09-BooksAndCourses.md#Data-Engineering-Podcast)
- [Roaring Elephant BiteSized Big Tech](sections/09-BooksAndCourses.md#Roaring-Elephant-BiteSized-Big-Tech)
- [SQL Data Partners Podcast](sections/09-BooksAndCourses.md#SQL-Data-Partners-Podcast)
## How To Contribute
If you have some cool links or topics for the cookbook, please become a contributor.
Simply pull the repo, add your ideas and create a pull request.
You can also open an issue and put your thoughts there.
Please use the "Issues" function for comments.
## Important Links
Subscribe to my YouTube channel for regular updates:
[Link to YouTube](https://www.youtube.com/channel/UCY8mzqqGwl5_bTpBY9qLMAA)
I have a Medium publication where you can publish your data engineer articles to reach more people:
[Medium publication](https://link.medium.com/9oi1VDrhPW)
*(As an Amazon Associate I earn from qualifying purchases from Amazon.
This is free of charge for you, but super helpful for supporting this channel.)*
================================================
FILE: sections/01-Introduction.md
================================================
Introduction
============
## Contents
- [What is this Cookbook](01-Introduction.md#what-is-this-cookbook)
- [Data Engineers](01-Introduction.md#data-engineers)
- [My Data Science Platform Blueprint](01-Introduction.md#my-data-science-platform-blueprint)
- [Connect](01-Introduction.md#connect)
- [Buffer](01-Introduction.md#buffer)
- [Processing Framework](01-Introduction.md#processing-framework)
- [Store](01-Introduction.md#store)
- [Visualize](01-Introduction.md#visualize)
- [Who Companies Need](01-Introduction.md#who-companies-need)
- [How to Learn Data Engineering](01-Introduction.md#how-to-learn-data-engineering)
- [Andreas interview on the Super Data Science Podcast](01-Introduction.md#Interview-with-Andreas-on-the-Super-Data-Science-Podcast)
- [Building Blocks to Learn Data Engineering](01-Introduction.md#building-blocks-to-learn-data-engineering)
- [Roadmap for Beginners](01-Introduction.md#roadmap-for-beginners)
- [Roadmap for Data Analysts](01-Introduction.md#roadmap-for-data-analysts)
- [Roadmap for Data Scientists](01-Introduction.md#roadmap-for-data-scientists)
- [Roadmap for Software Engineers](01-Introduction.md#roadmap-for-software-engineers)
- [Data Engineers Skills Matrix](01-Introduction.md#data-engineers-skills-matrix)
- [How to Become a Senior Data Engineer](01-Introduction.md#how-to-become-a-senior-data-engineer)
## What is this Cookbook
I get asked a lot:
"What do you actually need to learn to become an awesome data engineer?"
Well, look no further. You'll find it here!
If you are looking for AI algorithms and such data scientist things,
this book is not for you.
**How to use this Cookbook:**
This book is intended to be a starting point for you. It is not a training! I want to help you to identify the topics to look into to become an awesome data engineer in the process.
It hinges on my Data Science Platform Blueprint. Check it out below. Once you understand it, you can find in the book tools that fit into each key area of a Data Science platform (Connect, Buffer, Processing Framework, Store, Visualize).
Select a few tools you are interested in, then research and work with them.
Don't learn everything in this book! Focus.
**What types of content are in this book?**
You are going to find five types of content in this book: Articles
I wrote, links to my podcast episodes (video & audio), more than 200
links to helpful websites I like, data engineering interview questions
and case studies.
**This book is a work in progress!**
As you can see, this book is not finished. I'm constantly adding new
stuff and doing videos for the topics. But, obviously, because I do this
as a hobby, my time is limited. You can help make this book even
better.
**Help make this book awesome!**
If you have some cool links or topics for the cookbook, please become a
contributor on GitHub: <https://github.com/andkret/Cookbook>. Fork the
repo, add them, and create a pull request. Or join the discussion by
opening Issues. Tell me your thoughts, what you value,
what you think should be included, or correct me where I am wrong.
You can also write me an email any time at
plumbersofdatascience\@gmail.com.
**This Cookbook is and will always be free!**
## If You Like This Book & Need More Help:
Check out my Data Engineering Academy at LearnDataEngineering.com
**Visit learndataengineering.com:** [Click Here](https://learndataengineering.com)
- Huge Step by step Data Engineering Academy with over 30 courses
- Unlimited access incl. future courses during subscription
- Access to all courses and example projects in the Academy
- Associate Data Engineer Certification
- Data Engineering on AWS E-Commerce example project
- Microsoft Azure example project
- Document Streaming example project with Docker, FastAPI, Apache Kafka, Apache Spark, MongoDB and Streamlit
- Time Series example project with InfluxDB and Grafana
- Lifetime access to the private Discord Workspace
- Course certificates
- Currently over 54 hours of videos
## Support This Book For Free!
- **Amazon:** [Click Here](https://www.amazon.com/shop/plumbersofdatascience) buy whatever you like from Amazon using this link* (Also check out my complete podcast gear and books)
## How To Contribute
If you have some cool links or topics for the cookbook, please become a contributor.
Simply pull the repo, add your ideas and create a pull request.
You can also open an issue and put your thoughts there.
Please use the "Issues" function for comments.
Data Engineers
-------------------------------
Data Engineers are the link between the management's data strategy
and the data scientists or analysts that need to work with data.
What they do is build the platforms that enable data scientists to do
their magic.
These platforms are usually used in four different ways:
- Data ingestion and storage of large amounts of data.
- Algorithm creation by data scientists.
- Automation of the data scientist's machine learning models and
algorithms for production use.
- Data visualization for employees and customers.

Most of the time, data engineers start as traditional solution architects
for systems that involve SQL databases, web servers, SAP
installations and other "standard" systems.
But, to create big data platforms, the engineer needs to be an expert in
specifying, setting up, and maintaining big data technologies like:
Hadoop, Spark, HBase, Cassandra, MongoDB, Kafka, Redis, and more.
What they also need is experience on how to deploy systems on cloud
infrastructure like at Amazon or Google, or on-premise hardware.
| Podcast Episode: #048 From Wannabe Data Scientist To Engineer My Journey
|------------------|
|In this episode Kate Strachnyi interviews me for her humans of data science podcast. We talk about how I found out that I am more into the engineering part of data science.
| [Watch on YouTube](https://youtu.be/pIZkTuN5AMM) / [Listen on Anchor](https://anchor.fm/andreaskayy/episodes/048-From-Wannabe-Data-Scientist-To-Engineer-My-Journey-e45i2o)|
## My Data Science Platform Blueprint
I have created a simple and modular big data platform
blueprint. It is based on what I have seen in the field and
read in tech blogs all over the internet.
Why do I believe it will be super useful to you? Because, unlike other blueprints, it is not focused on technology.
Following my blueprint will allow you to create the big data platform
that fits exactly your needs. Building the perfect platform will allow
data scientists to discover new insights. It will enable you to perfectly handle big data and allow you to make
data-driven decisions.
The blueprint is focused on the five key areas: Connect, Buffer, Processing Frameworks, Store, and Visualize.

Having the platform split like this turns it into a modular platform with
loosely coupled interfaces.
Why is it so important to have a modular platform?
If you have a platform that is not modular, you end up with something
that is fixed or hard to modify. This means you can not adjust the
platform to changing requirements of the company.
Because of modularity, it is possible to specifically select tools for your use case. It also allows you to replace every component, if you need it.
Now, let's talk more about each key area.
### Connect
Ingestion is all about getting the data in from the source and making it
available to later stages. Sources can be everything from tweets to server
logs, to IoT sensor data (e.g. from cars).
Sources send data to your API Services. The API is going to push the
data into temporary storage.
The temporary storage allows other stages simple and fast access to
incoming data.
A great solution is to use messaging queue systems like Apache Kafka,
RabbitMQ or AWS Kinesis. Sometimes people also use caches like Redis for
specialised applications.
A good practice is that the temporary storage follows the
publish-subscribe pattern. This way APIs can publish messages and
Analytics can quickly consume them.
### Buffer
In the buffer phase you have pub/sub systems like Apache Kafka, Redis, or other Cloud tools like Google Pub/Sub or AWS Kinesis.
These systems are more or less message queues.
You put something in on one side and take it out on the other.
The idea behind buffers is to have an intermediate system for the incoming data.
How this works is, for instance, you're getting data in from an API.
The API is publishing into the message queue. Data is buffered there until it is picked up by the processing.
If you don't have a buffer, you can run into problems when writing directly into a store or you're processing the data directly. You can always have peaks of incoming data that stall the systems.
Like, it's lunch break and people are working with your app way more than usual.
There's more data coming in very very fast, faster than the analytics or the storage can handle.
In this case, you would run into problems, because the whole system would stall. It would therefore take long to process the data, and your customers would be annoyed.
With a buffer, you buffer the incoming data. Processes for storage and analytics can take out only as much data as they can process. You are no longer in danger of overpowering systems.
Buffers are also really good for building pipelines.
You take data out of Kafka, pre-process it, and put it back into Kafka.
Then, with another analytics process, you take the processed data back out and put it into a store.
Ta-da! A pipeline.
### Processing Framework
The analyse stage is where the actual analytics is done in
the form of stream and batch processing.
Streaming data is taken from ingest and fed into analytics. Streaming
analyses the "live" data, thus generating fast results.
As the central and most important stage, analytics also has access to
the big data storage. Because of that connection, analytics can take a
big chunk of data and analyse it.
This type of analysis is called batch processing. It will deliver you
answers for the big questions.
For a short video about batch and stream processing and their use cases, click on the link below:
[Adding Batch to a Streaming Pipeline](https://www.youtube.com/watch?v=o-aGi3FmdfU)
The analytics process, batch or streaming, is not a one-way process.
Analytics can also write data back to the big data storage.
Oftentimes, writing data back to the storage makes sense. It allows you
to combine previous analytics outputs with the raw data.
Analytics gives additional insights when you combine earlier results with
raw data. This combination will often allow you to create even more
useful insights.
A wide variety of analytics tools are available, ranging from MapReduce
or AWS Elastic MapReduce to Apache Spark and AWS Lambda.
### Store
This is the typical big-data storage where you just store everything. It
enables you to analyse the big picture.
Most of the data might seem useless for now, but it is of utmost
importance to keep it. Throwing data away is a big no-no.
Why not throw something away when it is useless?
Although it seems useless for now, data scientists can work with the
data. They might find new ways to analyse the data and generate valuable
insights from it.
What kind of systems can be used to store big data?
Systems like Hadoop HDFS, Hbase, Amazon S3 or DynamoDB are a perfect fit
to store big data.
Check out my podcast how to decide between SQL and NoSQL:
### Visualize
Displaying data is as important as ingesting, storing, and analysing it.
Visualizations enable business users to make data-driven decisions.
This is why it is important to have a good visual presentation of the
data. Sometimes you have a lot of different use cases or projects using
the platform.
It might not be possible to build the perfect UI that fits
everyone's needs. What you should do in this case is enable others to build the
perfect UI themselves.
How to do that? By creating APIs to access the data and making them
available to developers.
Either way, UI or API, the trick is to give the display stage direct
access to the data in the big-data cluster. This kind of access will
allow the developers to use analytics results as well as raw data to
build the perfect application.
## Who Companies Need
For a company, it is important to have well-trained data engineers.
That's why companies are looking for people with experience of tools in every part of the above platform blueprint. One common theme I see is cloud platform experience on AWS, Azure or GCP.
## How to Learn Data Engineering
### Interview with Andreas on the Super Data Science Podcast
#### Summary
This interview with Andreas on Jon Krohn's Super Data Science podcast delves into the intricacies of data engineering, highlighting its critical role in the broader data science ecosystem. Andreas, calling from Northern Bavaria, Germany, shares his journey from a data analyst to becoming a renowned data engineering educator through his Learn Data Engineering Academy. The conversation touches upon the foundational importance of data engineering in ensuring data quality, scalability, and accessibility for data scientists and analysts.
Andreas emphasizes that the best data engineers often have a background in the company's domain/niche, which equips them with a deep understanding of the end user's needs. The discussion also explores the essential tools and skills required in the field, such as relational databases, APIs, ETL tools, data streaming with Kafka, and the significance of learning platforms like AWS, Azure, and GCP. Andreas highlights the evolving landscape of data engineering, with a nod towards the emergence of roles like analytics engineers and the increasing importance of automation and advanced data processing tools like Snowflake, Databricks, and dbt.
The interview is not just a technical deep dive but also a personal journey of discovery and passion for data engineering, underscoring the perpetual learning and adaptation required in the fast-evolving field of data science.
| Watch or listen to this interview -> 657: How to Learn Data Engineering — with Andreas Kretz
|------------------|
| Was super fun talking with Jon about Data Engineering on the podcast. Think this will be very helpful for you :)
| [Watch on YouTube](https://youtu.be/sbDFADS-zo8) / [Listen to the Podcast](https://www.superdatascience.com/podcast/how-to-learn-data-engineering)|
#### Q&A Highlights
**Q: What is data engineering, and why is it important?** A: Data engineering is the foundation of the data science process, focusing on collecting, cleaning, and managing data to make it accessible and usable for data scientists and analysts. It's crucial for automating data processes, ensuring data quality, and enabling scalable data analysis and machine learning models.
**Q: How does one transition from data analysis to data engineering?**
A: The transition involves gaining a deep understanding of data pipelines, learning to work with various data processing and management tools, and developing skills in programming languages and technologies relevant to data engineering, such as SQL, Python, and cloud platforms like AWS or Azure.
**Q: What are the key skills and tools for a data engineer?**
A: Essential skills include proficiency in SQL, experience with ETL tools, knowledge of programming languages like Python, and familiarity with cloud services and data processing frameworks like Apache Spark. Tools like Kafka for data streaming and platforms like Snowflake and Databricks are also becoming increasingly important.
**Q: Can you elaborate on the emerging role of analytics engineers?**
A: Analytics engineers focus on bridging the gap between raw data management and data analysis, working closely with data warehouses and using tools like dbt to prepare and model data for easy analysis. This role is pivotal in making data more accessible and actionable for decision-making processes.
**Q: What advice would you give to someone aspiring to become a data engineer?**
A: Start by mastering the basics of SQL and Python, then explore and gain experience with various data engineering tools and technologies. It's also important to understand the data science lifecycle and how data engineering fits within it. Continuous learning and staying updated with industry trends are key to success in this field.
**Q: How does a data engineer's role evolve with experience?**
A: A data engineer's journey typically starts with focusing on specific tasks or segments of data pipelines, using a limited set of tools. As they gain experience, they broaden their skill set, manage entire data pipelines, and take on more complex projects. Senior data engineers often lead teams, design data architectures, and collaborate closely with data scientists and business stakeholders to drive data-driven decisions.
**Q: What distinguishes data engineering from machine learning engineering?**
A: While both fields overlap, especially in the use of data, data engineering focuses on the infrastructure and processes for handling data, ensuring its quality and accessibility. Machine learning engineering, on the other hand, centers on deploying and maintaining machine learning models in production environments. A strong data engineering foundation is essential for effective machine learning engineering.
**Q: Why might a data analyst transition to data engineering?**
A: Data analysts may transition to data engineering to work on more technical aspects of data handling, such as building and maintaining data pipelines, automating data processes, and ensuring data scalability. This transition allows them to have a more significant impact on the data lifecycle and contribute to more strategic data initiatives within an organization.
**Q: Can you share a challenging project you worked on as a data engineer?**
A: One challenging project involved creating a scalable data pipeline for real-time processing of machine-generated data. The complexity lay in handling vast volumes of data, ensuring its quality, and integrating various data sources while maintaining high performance. This project highlighted the importance of selecting the right tools and technologies, such as Kafka for data streaming and Apache Spark for data processing, to meet the project's demands.
**Q: How does the cloud influence data engineering?**
A: Cloud platforms like AWS, Azure, and GCP have transformed data engineering by providing scalable, flexible, and cost-effective solutions for data storage, processing, and analysis. They offer a wide range of services and tools that data engineers can leverage to build robust data pipelines and infrastructure, facilitating easier access to advanced data processing capabilities and enabling more innovative data solutions.
**Q: What future trends do you see in data engineering?**
A: Future trends in data engineering include the increasing adoption of cloud-native services, the rise of real-time data processing and analytics, greater emphasis on data governance and security, and the continued growth of machine learning and AI-driven data processes. Additionally, tools and platforms that simplify data engineering tasks and enable more accessible data integration and analysis will become more prevalent, democratizing data across organizations.
**Q: How does the background of a data analyst contribute to their success as a data engineer?**
A: Data analysts have a unique advantage when transitioning to data engineering due to their understanding of data's end-use. Their experience in analyzing data gives them insights into what makes data valuable and usable, enabling them to design more effective and user-centric data pipelines and storage solutions.
**Q: What role does automation play in data engineering?**
A: Automation is crucial in data engineering for scaling data processes, reducing manual errors, and ensuring consistency in data handling. Automated data pipelines allow for real-time data processing and integration, making data more readily available for analysis and decision-making.
**Q: Can you discuss the significance of cloud platforms in data engineering?**
A: Cloud platforms like AWS, Azure, and GCP offer scalable, flexible, and cost-effective solutions for data storage, processing, and analysis. They provide data engineers with a suite of tools and services to build robust data pipelines, implement machine learning models, and manage large volumes of data efficiently.
**Q: How does data engineering support data science and machine learning projects?**
A: Data engineering lays the groundwork for data science and machine learning by preparing and managing the data infrastructure. It ensures that high-quality, relevant data is available for model training and analysis, thereby enabling more accurate predictions and insights.
**Q: What emerging technologies or trends should data engineers be aware of?**
A: Data engineers should keep an eye on the rise of machine learning operations (MLOps) for integrating machine learning models into production, the growing importance of real-time data processing and analytics, and the adoption of serverless computing for more efficient resource management. Additionally, technologies like containerization (e.g., Docker) and orchestration (e.g., Kubernetes) are becoming critical for deploying and managing scalable data applications.
**Q: What challenges do data engineers face, and how can they be addressed?**
A: Data engineers often grapple with data quality issues, integrating disparate data sources, and scaling data infrastructure to meet growing data volumes. Addressing these challenges requires a solid understanding of data architecture principles, continuous monitoring and testing of data pipelines, and adopting best practices for data governance and management.
**Q: How important is collaboration between data engineers and other data professionals?**
A: Collaboration is key in the data ecosystem. Data engineers need to work closely with data scientists, analysts, and business stakeholders to ensure that data pipelines are aligned with business needs and analytical goals. Effective communication and a shared understanding of data objectives are vital for the success of data-driven projects.
### Building Blocks to Learn Data Engineering
The following Roadmaps all hinge on the courses in my Data Engineering Academy. They are designed to help students who come from many different professions and enable them to build a customized curriculum.
Here are all the courses currently available as of February 2024:
**Colors:** Blue (The Basics), Green (Platform & Pipeline Fundamentals), Orange (Fundamental Tools), Red (Example Projects)

### Roadmap for Beginners
Start this roadmap at my Academy: [Start Today](https://learndataengineering.com/p/data-engineering-for-beginners)
#### 11-Week Data Engineering Roadmap for Beginners & Graduates
#### Master the Fundamentals and Build Your First Data Pipelines
#### Starting in Data Engineering
Starting in data engineering can feel overwhelming, especially if you’re coming from a non-technical background or have only limited experience with coding and databases.
This 11-week roadmap, with a time commitment of 5–10 hours per week, is designed to help you build strong foundations in data engineering, step by step, before moving into cloud platforms and more advanced pipelines. You’ll learn essential concepts, hands-on coding, data modeling, and cloud ETL development—everything you need to kickstart your career as a data engineer.
---
#### Why This Roadmap is for You
- You’re just starting in data engineering and need a clear learning path
- You want to build a strong foundation in data platforms, SQL, and Python
- You need hands-on experience with data modeling, cloud ETL, and automation
- You want to work on real-world projects that prepare you for a data engineering job
By the end of this roadmap, you’ll have the skills, tools, and project experience to confidently apply for entry-level data engineering roles and start your career in the field.

---
#### What You’ll Achieve in This Roadmap
This roadmap is structured to help you understand the full data engineering workflow: from learning the fundamentals of data platforms and modeling to working with Python, SQL, and cloud-based ETL pipelines.
#### Learning Goals
| Goal | Description |
| ----------- | --------------------------------------------------- |
| **Goal #1** | Gain Experience in Data Platforms & Pipeline Design |
| **Goal #2** | Work with Data Like a Data Engineer Using Python & SQL |
| **Goal #3** | Learn Dimensional Data Modeling & Data Warehousing with Snowflake |
| **Goal #4** | Gain Experience with ELT Using dbt & Orchestration with Airflow |
| **Goal #5** | Build Your First ETL Pipeline on a Cloud Platform |
---
#### 11-Week Learning Roadmap
| Week | Topic | Key Learning Outcomes |
| --------------- | ----------------------------------------- | ------------------------------------------------------------------------------- |
| **Week 1** | Introduction & Platform & Pipeline Design | Understand data platforms, data pipelines, and the tools used in data engineering |
| **Week 2** | Relational Data Modeling | Develop skills in creating relational data models for structured data |
| **Week 3 & 4** | Python for Data Engineers | Learn Python for data processing, data manipulation, and pipeline development |
| **Week 5** | Advanced SQL | Gain expertise in querying, storing, and manipulating data in relational databases |
| **Week 6** | Dimensional Data Modeling | Master the techniques of dimensional modeling for analytics and reporting |
| **Week 7** | Snowflake Data Warehousing | Learn how to use Snowflake as a cloud data warehouse |
| **Week 8** | Data Transformation with dbt | Transform and model data efficiently using dbt |
| **Week 9** | Data Pipeline Orchestration with Airflow | Automate and manage data workflows using Apache Airflow |
| **Week 10 & 11**| End-to-End Project on AWS, Azure, or GCP | Complete an end-to-end project on a cloud platform of your choice |
---
#### Week 1: Introduction & Platform & Pipeline Design
##### 1. Learn the Basics of Platform & Pipeline Design
##### Data Platform and Pipeline Design
**Learn how to build data pipelines with templates and examples for Azure, GCP, and Hadoop**
##### Description
Data pipelines are the backbone of any Data Science platform. They are essential for data ingestion, processing, and machine learning workflows. This training will help you understand how to create stream and batch processing pipelines as well as machine learning pipelines by going through the most essential basics—complemented by templates and examples for useful cloud computing platforms.
Check out this course in my Academy: [Learn More](https://learndataengineering.com/p/data-pipeline-design)
##### Detailed Course Curriculum
| Module | Lesson | Duration |
|--------|--------|----------|
| **Platform & Pipeline Basics** | The Platform Blueprint | 10:11 |
| | Data Engineering Tools Guide | 2:44 |
| | End-to-End Pipeline Example | 6:18 |
| **Ingestion Pipelines** | Push Ingestion Pipelines | 3:42 |
| | Pull Ingestion Pipelines | 3:34 |
| **Pipeline Types** | Batch Pipelines | 3:07 |
| | Streaming Pipelines | 3:34 |
| **Visualization** | Stream Analytics | 2:26 |
| | Visualization Pipelines | 3:47 |
| | Visualization with Hive & Spark on Hadoop | 6:21 |
| | Visualization Data via Spark Thrift Server | 3:27 |
| **Platform Examples** | AWS, Azure, GCP (Currently Slides Only) | START |
---
##### 2. Get to Know the Different Data Stores
##### Choosing Data Stores
**Learn the different types of data storages and when to use which**
##### Description
One part of creating a data platform and pipelines is to choose data stores, which is the focus of this training. You will learn about relational databases, NoSQL databases, data warehouses, and data lakes. The goal is to help you understand when to use each type of data storage and how to incorporate them into your pipeline.
Check out this course in my Academy: [Learn More](https://learndataengineering.com/p/choosing-data-stores)
##### Detailed Course Curriculum
| Module | Lesson | Duration |
|--------|--------|----------|
| | What are Data Stores? | 2:09 |
| **Data Stores Basics** | OLTP vs OLAP | 7:34 |
| | ETL vs ELT | 5:45 |
| | Data Stores Ranking | 4:05 |
| **Relational Databases** | How to Choose Data Stores | 8:11 |
| | Relational Databases Concepts | 6:34 |
| **NoSQL Databases** | NoSQL Basics | 10:39 |
| | Document Stores | 5:56 |
| | Time Series Databases | 5:00 |
| | Search Engines | 4:18 |
| | Wide Column Stores | 4:22 |
| | Key Value Stores | 4:59 |
| | Graph Databases | 1:05 |
| **Data Warehouses & Data Lakes** | Data Warehouses | 5:32 |
| | Data Lakes | 7:10 |
---
#### 3. See Data Modeling Examples for the Learned Data Stores
##### Data Modeling 1
**Learn how to design schemas for SQL, NoSQL, and Data Warehouses**
##### Description
Schema design is a critical skill for data engineers. This training covers schema design for different data stores using an e-commerce dataset. You will see examples of how the same dataset is modeled for relational databases, NoSQL stores, wide column stores, document stores, key-value stores, and data warehouses. This will help you understand how to create maintainable models and avoid data swamps.
Check out this course in my Academy: [Learn More](https://learndataengineering.com/p/data-modeling)
##### Detailed Course Curriculum
| Module | Lesson | Duration |
|--------|--------|----------|
| | Why Data Modeling Is Important | 5:44 |
| | A Good Dataset | 1:28 |
| **Relational Databases** | Schema Design | 9:27 |
| **Wide Column Stores** | Schema Design | 7:35 |
| **Document Stores** | Schema Design | 7:28 |
| **Key Value Stores** | Schema Design | 4:49 |
| **Data Warehouses** | Schema Design | 4:44 |
| **Data Modeling Workshop** | November 2024 | 101:49 |
---
#### Week 2: Relational Data Modeling
##### Start with Relational Data Modeling
**Relational data modeling** is an essential skill, as even in modern "big data" environments, relational databases are often used for managing and serving metadata. This week focuses on building a strong foundation in relational data modeling, which is crucial for structuring data effectively and optimizing query performance.
##### Relational Data Modeling
**Learn the most important basics to create a data model for OLTP data stores**
##### Description
This course covers everything you need to know about relational data modeling—from understanding entities, attributes, and relationships to normalizing data models up to the third normal form (3NF). You will learn how to design conceptual, logical, and physical data models, implement primary and foreign keys, and ensure data quality through constraints and validations. Practical exercises include setting up a MySQL server with Docker and creating ER diagrams using MySQL Workbench.
Check out this course in my Academy: [Learn More](https://learndataengineering.com/p/relational-data-modeling)
##### Detailed Course Curriculum
| Module | Lesson | Duration |
|--------|--------|----------|
| **Basics and Prepare the Environment** | Relational Data Models History | 3:16 |
| | Installing MySQL Server and MySQL Workbench | 8:04 |
| | MySQL Workbench Introduction | 4:36 |
| **Create the Conceptual Data Model** | The Design Process Explained | 4:14 |
| | Discover the Entities | 10:24 |
| | Discover the Attributes | 13:09 |
| | Define Entity Relationships and Normalize the Data | 11:19 |
| **Defining and Resolving Relationships** | Identifying vs Non-Identifying Relationships | 2:01 |
| | How to Resolve Many-to-Many Relationships | 4:00 |
| | How to Resolve One-to-Many Relationships | 2:34 |
| | How to Resolve One-to-One Relationships | 1:45 |
| **Hands-On Workbench - Creating the Database** | Create Your ER Diagram Using Workbench | 19:46 |
| | Create a Physical Data Model | 4:13 |
| | Populate the MySQL DB with Data from .xls File | 15:13 |
---
#### Week 3 & 4: Python for Data Engineers
##### Description
This course offers a comprehensive guide to using Python for data engineering tasks. You’ll learn advanced Python features, including data processing with Pandas, working with APIs, interacting with PostgreSQL databases, and handling data types like JSON. The course also covers important programming concepts like exception handling, modules, unit testing, and object-oriented programming—all within the context of data engineering.
Check out this course in my Academy: [Learn More](https://learndataengineering.com/p/python-for-data-engineers)
##### Detailed Course Curriculum
| Module | Lesson | Duration |
|--------|--------|----------|
| **Advanced Python** | Classes | 4:37 |
| | Modules | 3:06 |
| | Exception Handling | 8:55 |
| | Logging | 5:12 |
| **Data Engineering** | Datetime | 8:04 |
| | JSON | 9:54 |
| | JSON Validation | 15:10 |
| | UnitTesting | 16:44 |
| | Pandas: Intro & Data Types | 8:43 |
| | Pandas: Appending & Merging DataFrames | 7:49 |
| | Pandas: Normalizing & Lambdas | 4:12 |
| | Pandas: Pivot & Parquet Write, Read | 6:17 |
| | Pandas: Melting & JSON Normalization | 8:15 |
| | Numpy | 4:47 |
| **Working with Data Sources/Sinks** | Requests (Working with APIs) | 11:15 |
| | Working with Databases: Setup | 4:06 |
| | Working with Databases: Tables, Bulk Load, Queries | 8:12 |
---
#### Week 5: SQL for Data Engineers
##### Description
SQL is the backbone of working with relational databases, and if you’re getting into Data Engineering, mastering SQL is a must. This course provides the essential SQL skills needed to work with databases effectively. You'll learn how to manage data, build efficient queries, and perform advanced operations to handle real-world data challenges.
Check out this course in my Academy: [Learn More](https://learndataengineering.com/p/sql-for-data-engineers)
##### Detailed Course Curriculum
| Module | Lesson | Duration |
|--------|--------|----------|
| **Basics** | Database Management Systems & SQL | 3:49 |
| | The Chinook Database | 3:03 |
| | SQLite Installation | 7:02 |
| | DBeaver Installation | 4:08 |
| | Data Types in SQLite | 6:15 |
| **Basic SQL** | DML & DDL | 15:06 |
| | Select Statements | 6:03 |
| | Grouping & Aggregation | 10:12 |
| | Joins | 10:05 |
| **Advanced SQL** | TCL Transaction Control Language | 6:42 |
| | Common Table Expressions & Subqueries | 10:26 |
| | Window Functions 1: Concept & Syntax | 5:00 |
| | Window Functions 2: Aggregate Functions | 7:24 |
| | Window Functions 3: Ranking Functions | 6:05 |
| | Window Functions 4: Analytical Functions | 7:20 |
| **Optimization** | Query Optimization | START |
| | Indexing Best Practices | START |
---
#### Week 6: Dimensional Data Modeling
##### Description
Dimensional data modeling is a crucial skill for data engineers working with analytics use cases where data needs to be structured efficiently for reporting and business insights. This course covers the basics of dimensional modeling, the medallion architecture, and how to create data models for OLAP data stores.
Check out this course in my Academy: [Learn More](https://learndataengineering.com/p/data-modeling-3-dimensional-data-modeling)
##### Detailed Course Curriculum
| Module | Lesson | Duration |
|--------|--------|----------|
| | Data Warehousing Basics | 6:42 |
| **Dimensional Modeling Basics** | Approaches to building a data warehouse | 5:20 |
| | Dimension tables explained | 5:34 |
| | Fact tables explained | 6:34 |
| | Identifying dimensions | 3:16 |
| **Data Warehouse Setup** | What is DuckDB | 5:58 |
| | First DuckDB hands-on | 2:20 |
| | Creating tables in DuckDB | 2:40 |
| | Installing DBeaver | 6:49 |
| **Working With The Data Warehouse** | Exploring SCD0 and SCD1 | 19:57 |
| | Exploring SCD2 | 13:52 |
| | Exploring transaction fact table | 6:28 |
| | Exploring accumulating fact table | 7:17 |
---
#### Week 7: Snowflake for Data Engineers
##### Description
Snowflake is a highly popular cloud-based data warehouse that is ideal for beginners due to its simplicity and powerful features. In this course, you will learn how to set up Snowflake, load and process data, and create visualizations. The course covers both SQL and Python methods for managing data within Snowflake, and provides hands-on experience with connecting Snowflake to other tools such as PowerBI.
Check out this course in my Academy: [Learn More](https://learndataengineering.com/p/snowflake-for-data-engineers)
##### Detailed Course Curriculum
| Module | Lesson | Duration |
|--------|--------|----------|
| **Introduction** | Snowflake basics | 4:16 |
| | Data Warehousing basics | 4:13 |
| | How Snowflake fits into data platforms | 3:14 |
| **Setup** | Snowflake Account setup | 4:24 |
| | Creating your warehouse & UI overview | 4:15 |
| **Loading CSVs from your PC** | Our dataset & goals | 3:01 |
| | Setup Snowflake database | 10:29 |
| | Preparing the upload file | 8:31 |
| | Using internal stages with SnowSQL | 12:37 |
| | Splitting a data table into two tables | 6:38 |
| **Visualizing Data** | Creating a visualization worksheet | 7:08 |
| | Creating a dashboard | 5:23 |
| | Connect PowerBI to Snowflake | 6:03 |
| | Query data with Python | 7:35 |
| **Automation** | Create import task | 9:18 |
| | Create table refresh task | 3:40 |
| | Test our pipeline | 3:14 |
| **AWS S3 Integration** | Working with external stages for AWS S3 | 10:20 |
| | Implementing snowpipe with S3 | 6:19 |
---
#### Week 8: dbt for Data Engineers
##### Description
This course introduces dbt (Data Build Tool), a SQL-first transformation workflow that allows you to transform, test, and document data directly within your data warehouse. You will learn how to set up dbt, connect it with Snowflake, create data pipelines, and implement advanced features like CI/CD and documentation generation. This training is ideal for data engineers looking to build trusted datasets for reporting, machine learning, and operational workflows.
Check out this course in my Academy: [Learn More](https://learndataengineering.com/p/dbt-for-data-engineers)
##### Detailed Course Curriculum
| Module | Lesson | Duration |
|--------|--------|----------|
| **dbt Introduction & Setup** | Modern data experience | 5:42 |
| | Introduction to dbt | 4:38 |
| | Goals of this course | 4:50 |
| | Snowflake preparation | 7:29 |
| | Loading data into Snowflake | 4:48 |
| | Setup dbt Core | 9:35 |
| | Preparing the GitHub repository | 3:32 |
| **Working with dbt-Core** | dbt models & materialization explained | 6:16 |
| | Creating your first SQL model | 5:48 |
| | Working with custom schemas | 5:28 |
| | Creating your first Python model | 4:35 |
| | dbt sources | 1:55 |
| | Configuring sources | 4:03 |
| | Working with seed files | 4:20 |
| **Tests in dbt** | Generic tests | 3:19 |
| | Tests with Great Expectations | 3:25 |
| | Writing custom generic tests | 2:49 |
| **Working with dbt-Cloud** | dbt cloud setup | 7:25 |
| | Creating dbt jobs | 5:14 |
| | CI/CD automation with dbt cloud and GitHub | 10:52 |
| | Documentation in dbt | 7:38 |
---
#### Week 9: Apache Airflow Workflow Orchestration
##### Description
Airflow is a platform-independent workflow orchestration tool that offers many possibilities to create and monitor stream and batch pipeline processes. It supports complex, multi-stage processes across major platforms and tools in the data engineering world, such as AWS or Google Cloud. Airflow is not only great for planning and organizing your processes but also provides robust monitoring capabilities, allowing you to keep track of data workflows and troubleshoot effectively.
Check out this course in my Academy: [Learn More](https://learndataengineering.com/p/learn-apache-airflow)
##### Detailed Course Curriculum
| Module | Lesson | Duration |
|--------|--------|----------|
| **Airflow Workflow Orchestration** | Airflow Usage | 3:19 |
| **Airflow Fundamental Concepts** | Fundamental Concepts | 2:47 |
| | Airflow Architecture | 3:09 |
| | Example Pipelines | 4:49 |
| | Spotlight 3rd Party Operators | 2:17 |
| | Airflow XComs | 4:32 |
| **Hands-On Setup** | Project Setup | 1:43 |
| | Docker Setup Explained | 2:06 |
| | Docker Compose & Starting Containers | 4:23 |
| | Checking Services | 1:48 |
| | Setup WeatherAPI | 1:33 |
| | Setup Postgres DB | 1:58 |
| **Learn Creating DAGs** | Airflow Webinterface | 4:37 |
| | Creating DAG With Airflow 2.0 | 9:46 |
| | Running our DAG | 4:15 |
| | Creating DAG With TaskflowAPI | 6:59 |
| | Getting Data From the API With SimpleHTTPOperator | 3:38 |
| | Writing into Postgres | 4:12 |
| | Parallel Processing | 4:15 |
---
#### Week 10 & 11: End-to-End Project on AWS, Azure, or GCP
##### Important: Choose One Project
Participants need to select **one** of the following cloud platforms to complete their end-to-end data engineering project. It is not necessary to complete all three projects.
##### AWS Project Introduction
The AWS project is designed for those who want to get started with cloud platforms, particularly with Amazon Web Services, the leading platform in data processing. This project will guide you through setting up an end-to-end data engineering pipeline using AWS tools like Lambda, API Gateway, Glue, Redshift, Kinesis, and DynamoDB. You will work with an e-commerce dataset, learn data modeling, and implement both stream and batch processing pipelines.
Check out this course in my Academy: [Learn More](https://learndataengineering.com/p/data-engineering-on-aws)
##### Detailed AWS Project Curriculum
| Module | Lesson | Duration |
|--------|--------|----------|
| | Data Engineering | 4:15 |
| | Data Science Platform | 5:20 |
| **The Dataset** | Data Types You Encounter | 3:03 |
| | What Is A Good Dataset | 2:54 |
| | The Dataset We Use | 3:16 |
| | Defining The Purpose | 6:27 |
| | Relational Storage Possibilities | 3:46 |
| | NoSQL Storage Possibilities | 6:28 |
| **Platform Design** | Selecting The Tools | 3:49 |
| | Client | 3:05 |
| | Connect | 1:18 |
| | Buffer | 1:28 |
| | Process | 2:42 |
| | Store | 3:41 |
| | Visualize | 3:00 |
| **Data Pipelines** | Data Ingestion Pipeline | 3:00 |
| | Stream To Raw Storage Pipeline | 2:19 |
| | Stream To DynamoDB Pipeline | 3:09 |
| | Visualization API Pipeline | 2:56 |
| | Visualization Redshift Data Warehouse Pipeline | 5:29 |
| | Batch Processing Pipeline | 3:19 |
| **AWS Basics** | Create An AWS Account | 1:58 |
| | Things To Keep In Mind | 2:45 |
| | IAM Identity & Access Management | 4:06 |
| | Logging | 2:22 |
| | AWS Python API Boto3 | 2:57 |
| **Data Ingestion Pipeline** | Development Environment | 4:02 |
| | Create Lambda for API | 2:33 |
| | Create API Gateway | 8:30 |
| | Setup Kinesis | 1:38 |
| | Setup IAM for API | 5:00 |
| | Create Ingestion Pipeline (Code) | 6:09 |
| | Create Script to Send Data | 5:46 |
| | Test The Pipeline | 4:53 |
| **Stream To Raw S3 Storage Pipeline** | Setup S3 Bucket | 3:42 |
| | Configure IAM For S3 | 3:21 |
| | Create Lambda For S3 Insert | 7:16 |
| | Test The Pipeline | 4:01 |
| **Stream To DynamoDB Pipeline** | Setup DynamoDB | 9:00 |
| | Setup IAM For DynamoDB Stream | 3:36 |
| | Create DynamoDB Lambda | 9:20 |
| **Visualization API** | Create API & Lambda For Access | 6:10 |
| | Test The API | 4:47 |
| **Visualization Pipeline Redshift Data Warehouse** | Setup Redshift Data Warehouse | 8:08 |
| | Security Group For Firehose | 3:12 |
| | Create Redshift Tables | 5:51 |
| | S3 Bucket & jsonpaths.json | 3:02 |
| | Configure Firehose | 7:58 |
| | Debug Redshift Streaming | 7:43 |
| | Bug-fixing | 5:58 |
| | Power BI | 12:16 |
| **Batch Processing Pipeline** | AWS Glue Basics | 5:14 |
| | Glue Crawlers | 13:09 |
| | Glue Jobs | 13:43 |
| | Redshift Insert & Debugging | 7:16 |
---
##### Azure Project Introduction
The Azure project is designed for those who want to build a streaming data pipeline using Microsoft Azure's robust cloud platform. This project introduces you to Azure services such as APIM, Blob Storage, Azure Functions, Cosmos DB, and Power BI. You will gain practical experience by building a pipeline that ingests, processes, stores, and visualizes data, using Python and Visual Studio Code.
Check out this course in my Academy: [Learn More](https://learndataengineering.com/p/build-streaming-data-pipelines-in-azure)
##### Detailed Azure Project Curriculum
| Module | Lesson | Duration |
|--------|--------|----------|
| **Project Introduction** | Data Engineering in Azure - Streaming Data Pipelines | 2:43 |
| **Datasets and Local Preprocessing** | Introduction to Datasets and Local Preprocessing | 7:06 |
| | Deploying Code on Visual Studio to Docker Containers | 5:27 |
| **Azure Functions and Blob Storage** | Develop Azure Functions via Python and VS Code | 5:52 |
| | Deploy Azure Function to Azure Function App and Test It | 6:26 |
| | Integrate Azure Function with Blob Storage via Bindings | 4:58 |
| **Add Azure Function to Azure API Management (APIM)** | Expose Azure Function as a Backend | 7:05 |
| | Securely Store Secrets in Azure Key Vault | 4:41 |
| | Add Basic Authentication in API Management | 4:35 |
| | Test APIM and Imported Azure Function via Local Python Program | 2:34 |
| **Create and Combine Event Hubs, Azure Function, and Cosmos DB** | Create Event Hubs and Test Capture Events Feature | 6:59 |
| | Modify Existing Azure Function to Include Event Hubs Binding | 6:42 |
| **Write Tweets to Cosmos DB (Core SQL) from Event Hub** | Create a Cosmos DB (Core SQL) | 9:03 |
| | Create a New Azure Function that Writes Messages to Cosmos DB | 9:03 |
| **Connect Power BI Desktop to Your Cosmos DB** | Connect Power BI Desktop via Connector and Create a Dashboard | 6:32 |
---
##### GCP Project Introduction
The GCP project is designed for those who want to learn how to build, manage, and optimize data pipelines on Google Cloud Platform. This project focuses on building an end-to-end pipeline that extracts data from an external weather API, processes it through GCP's data tools, and visualizes the results using Looker Studio. This project offers practical, hands-on experience with tools like Cloud SQL, Compute Engine, Cloud Functions, Pub/Sub, and Looker Studio.
Check out this course in my Academy: [Learn More](https://learndataengineering.com/p/data-engineering-on-gcp)
##### Detailed GCP Project Curriculum
| Module | Lesson | Duration |
|--------|--------|----------|
| **Introduction** | Introduction | 1:13 |
| | GitHub & the Team | 1:30 |
| **Data & Goals** | Architecture of the Project | 3:19 |
| | Introduction to Weather API | 2:18 |
| | Setup Google Cloud Account | 2:12 |
| **Project Setup** | Creating the Project | 2:35 |
| | Enabling the Required APIs | 1:34 |
| | Configure Scheduling | 2:20 |
| **Pipeline Creation - Extract from API** | Setup VM for Database Interaction | 2:53 |
| | Setup MySQL Database | 2:16 |
| | Setup VM Client and Create Database | 2:46 |
| | Creating Pub/Sub Message Queue | 1:41 |
| | Create Cloud Function to Pull Data from API | 4:17 |
| | Explanation of Code to Pull from API | 4:20 |
| **Pipeline Creation - Write to Database** | Create Function to Write to Database | 7:47 |
| | Explanation of Code to Write Data to Database | 5:56 |
| | Testing the Function | 5:51 |
| | Create Function Write Data to DB - Pull | 3:53 |
| | Explanation Code Write Data to DB - Pull | 4:33 |
| **Visualization** | Setup Looker Studio and Create Bubble Chart | 2:20 |
| | Setup Looker Studio and Create Time Series Chart | 1:57 |
| | Pipeline Monitoring | 6:20 |
---
##### What’s Next?
After completing this roadmap, you’ll have the confidence and skills to not just analyze data but to engineer and optimize it like a pro! Explore advanced topics, start contributing to projects, and showcase your new skills to potential employers.
### Roadmap for Data Analysts
Start this roadmap at my Academy: [Start Today](https://learndataengineering.com/p/data-engineering-for-data-analysts)
#### Go Beyond SQL and Learn How to Build, Automate, and Optimize Data Pipelines Like an Engineer
#### Who Is This 10-Week Roadmap For?
- Data Analysts who want to understand the full data lifecycle
- Those looking to move beyond SQL and build real data pipelines
- Professionals seeking hands-on, practical experience to boost their resumes
- Anyone wanting to stay competitive in the job market
#### What You’ll Achieve
This roadmap provides a step-by-step approach to mastering data engineering skills. You'll start with Python and data modeling, move on to building pipelines, work with cloud platforms, and finally automate workflows using industry-standard tools.

---
#### Learning Goals
| Goal | Description |
| ----------- | --------------------------------------------------- |
| **Goal #1** | Master Python & Relational Data Modeling |
| **Goal #2** | Build Your First ETL Pipeline on AWS (or Azure/GCP) |
| **Goal #3** | Gain Hands-On Experience with Snowflake & dbt |
| **Goal #4** | Connect AWS and Snowflake |
| **Goal #5** | Automate Your Data Pipeline with Airflow |
---
#### 10-Week Learning Roadmap
| Week | Topic | Key Learning Outcomes |
| --------------- | ----------------------------------------- | ------------------------------------------------------------------------------- |
| **Week 1** | Introduction to Data Engineering & Python | Understand core concepts of data engineering and Python programming basics |
| **Week 2** | Platform & Pipeline Design | Learn how to design effective data platforms and pipelines |
| **Week 3** | Relational Data Modeling | Develop skills in creating relational data models for structured data |
| **Week 4** | Dimensional Data Modeling | Master the techniques of dimensional modeling for analytics and reporting |
| **Week 5** | Docker Fundamentals & APIs | Get hands-on with containerization using Docker and working with APIs |
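| **Week 6 & 7** | End-to-End Project on AWS, Azure, or GCP | Build an end-to-end data engineering project on a cloud platform of your choice |
| **Week 8** | Working with Snowflake | Gain practical experience using Snowflake as a data warehouse |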
| **Week 9** | Transforming Data With dbt | Learn to transform and model data efficiently using dbt |
| **Week 10** | Pipeline Orchestration with Airflow | Automate and manage data workflows using Apache Airflow |
---
#### Detailed Weekly Content
#### Week 1: Introduction to Data Engineering & Python
If you want to take your data engineering skills to the next level, you are in the right place. Python has become the go-to language for data analysis and machine learning, and with our training, you will learn how to successfully use Python to build robust data pipelines and manipulate data efficiently.
This comprehensive training program is designed for data engineers of all levels. Whether you are just starting out in data engineering or you are an experienced engineer looking to expand your skill set, our Python for Data Engineers training will give you the tools you need to excel in your field.
At the end of the training, you will have a strong foundation in Python and data engineering and be ready to tackle complex data engineering projects with ease.
Check out this course in my Academy: [Learn More](https://learndataengineering.com/p/python-for-data-engineers)
##### Course Curriculum
| Lesson | Duration |
|--------|----------|
| Classes | 4:37 |
| Modules | 3:06 |
| Exception Handling | 8:55 |
| Logging | 5:12 |
| Datetime | 8:04 |
| JSON | 9:54 |
| JSON Validation | 15:10 |
| UnitTesting | 16:44 |
| Pandas: Intro & data types | 8:43 |
| Pandas: Appending & Merging DataFrames | 7:49 |
| Pandas: Normalizing & Lambdas | 4:12 |
| Pandas: Pivot & Parquet write, read | 6:17 |
| Pandas: Melting & JSON normalization | 8:15 |
| Numpy | 4:47 |
| Requests (Working with APIs) | 11:15 |
| Working with Databases: Setup | 4:06 |
| Working with Databases: Tables, bulk load, queries | 8:12 |
---
#### Week 2: Platform & Pipeline Design
##### Description
Data pipelines are the most important part of any Data Science platform. Without them, data ingestion or machine learning processing, for example, would not be possible.
This 110-minute training will help you understand how to create stream and batch processing pipelines as well as machine learning pipelines by going through some of the most essential basics - complemented by templates and examples for useful cloud computing platforms.
Check out this course in my Academy: [Learn More](https://learndataengineering.com/p/data-pipeline-design)
##### Course Curriculum
| Lesson | Duration |
|--------|----------|
| Platform Blueprint & End to End Pipeline Example | 10:11 |
| Data Engineering Tools Guide | 2:44 |
| End to End Pipeline Example | 6:18 |
| Push Ingestion Pipelines | 3:42 |
| Pull Ingestion Pipelines | 3:34 |
| Batch Pipelines | 3:07 |
| Streaming Pipelines | 3:34 |
| Stream Analytics | 2:26 |
| Lambda Architecture | 4:02 |
| Visualization Pipelines | 3:47 |
| Visualization with Hive & Spark on Hadoop | 6:21 |
| Visualization Data via Spark Thrift Server | 3:27 |
---
#### Week 3: Relational Data Modeling
##### Description
Relational modeling is often used for building transactional databases. You might say, 'But I'm not planning to become a back-end engineer.' Still, apart from knowing how to move data, you should also know how to store it effectively, which involves designing a scalable data model optimized for faster query response times and efficient data retrieval.
Check out this course in my Academy: [Learn More](https://learndataengineering.com/p/relational-data-modeling)
##### Course Curriculum
| Lesson | Duration |
|--------|----------|
| Relational Data Models History | 3:16 |
| Installing MySQL Server and MySQL Workbench | 8:04 |
| MySQL Workbench Introduction | 4:36 |
| The Design Process Explained | 4:14 |
| Discover the Entities | 10:24 |
| Discover the Attributes | 13:09 |
| Define Entity Relationships and Normalize the Data | 11:19 |
| Identifying vs Non-identifying Relationships | 2:01 |
| Resolve Many-to-Many Relationships | 4:00 |
| Resolve One-to-Many Relationships | 2:34 |
| Resolve One-to-One Relationships | 1:45 |
| Create ER Diagram Using Workbench | 19:46 |
| Create a Physical Data Model | 4:13 |
| Populate MySQL DB with Data from .xls File | 15:13 |
| Course Conclusion | 1:28 |
---
#### Week 4: Dimensional Data Modeling
##### Description
In today’s data-driven world, efficient data organization is key to enabling insightful analysis and reporting. Dimensional data modeling is a crucial technique that helps structure your data for faster querying and better decision-making.
Check out this course in my Academy: [Learn More](https://learndataengineering.com/p/data-modeling-3-dimensional-data-modeling)
##### Course Curriculum
| Lesson | Duration |
|--------|----------|
| Intro to Data Warehousing | 6:42 |
| Approaches to Building a Data Warehouse | 5:20 |
| Dimension Tables Explained | 5:34 |
| Fact Tables Explained | 6:34 |
| Identifying Dimensions | 3:16 |
| What is DuckDB | 5:58 |
| First DuckDB Hands-on | 2:20 |
| Creating Tables in DuckDB | 2:40 |
| Installing DBeaver | 6:49 |
| Exploring SCD0 and SCD1 | 19:57 |
| Exploring SCD2 | 13:52 |
| Exploring Transaction Fact Table | 6:28 |
| Exploring Accumulating Fact Table | 7:17 |
| Course Conclusion | 0:52 |
---
#### Week 5: Docker Fundamentals & APIs
##### Description
Week 5 covers two crucial topics: containerization using Docker and building APIs with FastAPI. Docker is essential for creating lightweight, self-contained containers, while APIs are the backbone of data platforms.
Check out Docker Fundamentals in my Academy: [Learn More](https://learndataengineering.com/p/docker-fundamentals)
Check out Building APIs with FastAPI in my Academy: [Learn More](https://learndataengineering.com/p/apis-with-fastapi-course)
##### Course Curriculum
##### Docker Fundamentals
| Lesson | Duration |
|--------|----------|
| Docker vs Virtual Machines | 6:23 |
| Docker Terminology | 5:56 |
| Installing Docker Desktop | 4:09 |
| Pulling Images & Running Containers | 6:34 |
| Docker Compose | 6:34 |
| Build & Run Simple Image | 6:28 |
| Build Image with Dependencies | 5:05 |
| Using DockerHub Image Registry | 4:24 |
| Image Layers & Security Best Practices | 7:55 |
| Managing Docker with Portainer | 4:04 |
##### Building APIs with FastAPI
| Lesson | Duration |
|--------|----------|
| What are APIs? | 8:29 |
| Hosting vs Using APIs | 4:08 |
| HTTP Methods & Media Types | 6:56 |
| API Parameters & Response Codes | 9:40 |
| Setting up FastAPI | 4:55 |
| Creating APIs: POST, GET, PUT | 16:18 |
| Testing APIs with Postman | 4:22 |
| Deploying FastAPI with Docker | 6:01 |
| API Security Best Practices | 3:48 |
---
#### Week 6 & 7: End-to-End Project on AWS, Azure, or GCP
##### Important: Choose One Project
Participants need to select **one** of the following cloud platforms to complete their end-to-end data engineering project. It is not necessary to complete all three projects.
##### AWS Project Introduction
The AWS project is designed for those who want to get started with cloud platforms, particularly with Amazon Web Services, the leading platform in data processing. This project will guide you through setting up an end-to-end data engineering pipeline using AWS tools like Lambda, API Gateway, Glue, Redshift, Kinesis, and DynamoDB. You will work with an e-commerce dataset, learn data modeling, and implement both stream and batch processing pipelines.
Check out this project in my Academy: [Learn More](https://learndataengineering.com/p/data-engineering-on-aws)
##### Detailed AWS Project Curriculum
| Module | Lesson | Duration |
|--------|--------|----------|
| | Data Engineering | 4:15 |
| | Data Science Platform | 5:20 |
| **The Dataset** | Data Types You Encounter | 3:03 |
| | What Is A Good Dataset | 2:54 |
| | The Dataset We Use | 3:16 |
| | Defining The Purpose | 6:27 |
| | Relational Storage Possibilities | 3:46 |
| | NoSQL Storage Possibilities | 6:28 |
| **Platform Design** | Selecting The Tools | 3:49 |
| | Client | 3:05 |
| | Connect | 1:18 |
| | Buffer | 1:28 |
| | Process | 2:42 |
| | Store | 3:41 |
| | Visualize | 3:00 |
| **Data Pipelines** | Data Ingestion Pipeline | 3:00 |
| | Stream To Raw Storage Pipeline | 2:19 |
| | Stream To DynamoDB Pipeline | 3:09 |
| | Visualization API Pipeline | 2:56 |
| | Visualization Redshift Data Warehouse Pipeline | 5:29 |
| | Batch Processing Pipeline | 3:19 |
| **AWS Basics** | Create An AWS Account | 1:58 |
| | Things To Keep In Mind | 2:45 |
| | IAM Identity & Access Management | 4:06 |
| | Logging | 2:22 |
| | AWS Python API Boto3 | 2:57 |
| **Data Ingestion Pipeline** | Development Environment | 4:02 |
| | Create Lambda for API | 2:33 |
| | Create API Gateway | 8:30 |
| | Setup Kinesis | 1:38 |
| | Setup IAM for API | 5:00 |
| | Create Ingestion Pipeline (Code) | 6:09 |
| | Create Script to Send Data | 5:46 |
| | Test The Pipeline | 4:53 |
| **Stream To Raw S3 Storage Pipeline** | Setup S3 Bucket | 3:42 |
| | Configure IAM For S3 | 3:21 |
| | Create Lambda For S3 Insert | 7:16 |
| | Test The Pipeline | 4:01 |
| **Stream To DynamoDB Pipeline** | Setup DynamoDB | 9:00 |
| | Setup IAM For DynamoDB Stream | 3:36 |
| | Create DynamoDB Lambda | 9:20 |
| **Visualization API** | Create API & Lambda For Access | 6:10 |
| | Test The API | 4:47 |
| **Visualization Pipeline Redshift Data Warehouse** | Setup Redshift Data Warehouse | 8:08 |
| | Security Group For Firehose | 3:12 |
| | Create Redshift Tables | 5:51 |
| | S3 Bucket & jsonpaths.json | 3:02 |
| | Configure Firehose | 7:58 |
| | Debug Redshift Streaming | 7:43 |
| | Bug-fixing | 5:58 |
| | Power BI | 12:16 |
| **Batch Processing Pipeline** | AWS Glue Basics | 5:14 |
| | Glue Crawlers | 13:09 |
| | Glue Jobs | 13:43 |
| | Redshift Insert & Debugging | 7:16 |
---
##### Azure Project Introduction
The Azure project is designed for those who want to build a streaming data pipeline using Microsoft Azure's robust cloud platform. This project introduces you to Azure services such as APIM, Blob Storage, Azure Functions, Cosmos DB, and Power BI. You will gain practical experience by building a pipeline that ingests, processes, stores, and visualizes data, using Python and Visual Studio Code.
Check out this project in my Academy: [Learn More](https://learndataengineering.com/p/build-streaming-data-pipelines-in-azure)
##### Detailed Azure Project Curriculum
| Module | Lesson | Duration |
|--------|--------|----------|
| **Project Introduction** | Data Engineering in Azure - Streaming Data Pipelines | 2:43 |
| **Datasets and Local Preprocessing** | Introduction to Datasets and Local Preprocessing | 7:06 |
| | Deploying Code on Visual Studio to Docker Containers | 5:27 |
| **Azure Functions and Blob Storage** | Develop Azure Functions via Python and VS Code | 5:52 |
| | Deploy Azure Function to Azure Function App and Test It | 6:26 |
| | Integrate Azure Function with Blob Storage via Bindings | 4:58 |
| **Add Azure Function to Azure API Management (APIM)** | Expose Azure Function as a Backend | 7:05 |
| | Securely Store Secrets in Azure Key Vault | 4:41 |
| | Add Basic Authentication in API Management | 4:35 |
| | Test APIM and Imported Azure Function via Local Python Program | 2:34 |
| **Create and Combine Event Hubs, Azure Function, and Cosmos DB** | Create Event Hubs and Test Capture Events Feature | 6:59 |
| | Modify Existing Azure Function to Include Event Hubs Binding | 6:42 |
| **Write Tweets to Cosmos DB (Core SQL) from Event Hub** | Create a Cosmos DB (Core SQL) | 9:03 |
| | Create a New Azure Function that Writes Messages to Cosmos DB | 9:03 |
| **Connect Power BI Desktop to Your Cosmos DB** | Connect Power BI Desktop via Connector and Create a Dashboard | 6:32 |
---
##### GCP Project Introduction
The GCP project is designed for those who want to learn how to build, manage, and optimize data pipelines on Google Cloud Platform. This project focuses on building an end-to-end pipeline that extracts data from an external weather API, processes it through GCP's data tools, and visualizes the results using Looker Studio. This project offers practical, hands-on experience with tools like Cloud SQL, Compute Engine, Cloud Functions, Pub/Sub, and Looker Studio.
Check out this project in my Academy: [Learn More](https://learndataengineering.com/p/data-engineering-on-gcp)
##### Detailed GCP Project Curriculum
| Module | Lesson | Duration |
|--------|--------|----------|
| **Introduction** | Introduction | 1:13 |
| | GitHub & the Team | 1:30 |
| **Data & Goals** | Architecture of the Project | 3:19 |
| | Introduction to Weather API | 2:18 |
| | Setup Google Cloud Account | 2:12 |
| **Project Setup** | Creating the Project | 2:35 |
| | Enabling the Required APIs | 1:34 |
| | Configure Scheduling | 2:20 |
| **Pipeline Creation - Extract from API** | Setup VM for Database Interaction | 2:53 |
| | Setup MySQL Database | 2:16 |
| | Setup VM Client and Create Database | 2:46 |
| | Creating Pub/Sub Message Queue | 1:41 |
| | Create Cloud Function to Pull Data from API | 4:17 |
| | Explanation of Code to Pull from API | 4:20 |
| **Pipeline Creation - Write to Database** | Create Function to Write to Database | 7:47 |
| | Explanation of Code to Write Data to Database | 5:56 |
| | Testing the Function | 5:51 |
| | Create Function Write Data to DB - Pull | 3:53 |
| | Explanation Code Write Data to DB - Pull | 4:33 |
| **Visualization** | Setup Looker Studio and Create Bubble Chart | 2:20 |
| | Setup Looker Studio and Create Time Series Chart | 1:57 |
| | Pipeline Monitoring | 6:20 |
---
#### Week 8: Working with Snowflake
##### Description
Currently, Snowflake is the analytics store/data warehouse everybody is talking about. It is a 100% cloud-based platform that offers many advantages, including flexible data access and the ability to scale services as needed. Snowflake is widely used in the industry, and learning it will enhance your data engineering skill set.
This training covers everything from the basics of Snowflake and data warehousing to advanced integration and automation techniques. By the end, you will have the knowledge to prepare, integrate, manage data on Snowflake, and connect other systems to the platform.
Check out this course in my Academy: [Learn More](https://learndataengineering.com/p/snowflake-for-data-engineers)
##### Course Curriculum
| Module | Lesson | Duration |
|--------|--------|----------|
| | Snowflake Basics | 4:16 |
| | Data Warehousing Basics | 4:13 |
| | How Snowflake Fits into Data Platforms | 3:14 |
| **Setup** | Snowflake Account Setup | 4:24 |
| | Creating Your Warehouse & UI Overview | 4:15 |
| **Loading CSVs from Your PC** | Our Dataset & Goals | 3:01 |
| | Setup Snowflake Database | 10:29 |
| | Preparing the Upload File | 8:31 |
| | Using Internal Stages with SnowSQL | 12:37 |
| | Splitting a Data Table into Two Tables | 6:38 |
| **Visualizing Data** | Creating a Visualization Worksheet | 7:08 |
| | Creating a Dashboard | 5:23 |
| | Connect PowerBI to Snowflake | 6:03 |
| | Query Data with Python | 7:35 |
| **Automation** | Create Import Task | 9:18 |
| | Create Table Refresh Task | 3:40 |
| | Test Our Pipeline | 3:14 |
| **AWS S3 Integration** | Working with External Stages for AWS S3 | 10:20 |
| | Implementing Snowpipe with S3 | 6:19 |
---
#### Week 9: Transforming Data With dbt
##### Description
dbt is a SQL-first transformation workflow that simplifies the process of transforming, testing, and documenting data. It allows teams to work directly within the data warehouse, creating trusted datasets for reporting, machine learning, and operational workflows. This training is the perfect starting point to get hands-on experience with dbt Core, dbt Cloud, and Snowflake.
Check out this course in my Academy: [Learn More](https://learndataengineering.com/p/dbt-for-data-engineers)
##### Course Curriculum
| Module | Lesson | Duration |
|--------|--------|----------|
| **dbt Introduction & Setup** | Modern Data Experience | 5:42 |
| | Introduction to dbt | 4:38 |
| | Goals of this Course | 4:50 |
| | Snowflake Preparation | 7:29 |
| | Loading Data into Snowflake | 4:48 |
| | Setup dbt Core | 9:35 |
| | Preparing the GitHub Repository | 3:32 |
| **Working with dbt-Core** | dbt Models & Materialization Explained | 6:16 |
| | Creating Your First SQL Model | 5:48 |
| | Working with Custom Schemas | 5:28 |
| | Creating Your First Python Model | 4:35 |
| | dbt Sources | 1:55 |
| | Configuring Sources | 4:03 |
| | Working with Seed Files | 4:20 |
| **Tests in dbt** | Generic Tests | 3:19 |
| | Tests with Great Expectations | 3:25 |
| | Writing Custom Generic Tests | 2:49 |
| **Working with dbt-Cloud** | dbt Cloud Setup | 7:25 |
| | Creating dbt Jobs | 5:14 |
| | CI/CD Automation with dbt Cloud and GitHub | 10:52 |
| | Documentation in dbt | 7:38 |
---
#### Week 10: Pipeline Orchestration with Airflow
##### Description
Apache Airflow is a powerful, platform-independent workflow orchestration tool widely used in the data engineering world. It allows you to create and monitor both stream and batch pipeline processes with ease. Airflow supports integration with major platforms and tools such as AWS, Google Cloud, and many more.
Airflow not only helps in planning and organizing workflows but also offers robust monitoring features, allowing you to troubleshoot and maintain complex ETL pipelines effectively. As one of the most popular tools for workflow orchestration, mastering Airflow is highly valuable for data engineers.
Check out this course in my Academy: [Learn More](https://learndataengineering.com/p/learn-apache-airflow)
##### Course Curriculum
| Module | Lesson | Duration |
|--------|--------|----------|
| **Airflow Workflow Orchestration** | Airflow Usage | 3:19 |
| **Airflow Fundamental Concepts** | Fundamental Concepts | 2:47 |
| | Airflow Architecture | 3:09 |
| | Example Pipelines | 4:49 |
| | Spotlight 3rd Party Operators | 2:17 |
| | Airflow XComs | 4:32 |
| **Hands-On Setup** | Project Setup | 1:43 |
| | Docker Setup Explained | 2:06 |
| | Docker Compose & Starting Containers | 4:23 |
| | Checking Services | 1:48 |
| | Setup WeatherAPI | 1:33 |
| | Setup Postgres DB | 1:58 |
| **Learn Creating DAGs** | Airflow Webinterface | 4:37 |
| | Creating DAG With Airflow 2.0 | 9:46 |
| | Running our DAG | 4:15 |
| | Creating DAG With TaskflowAPI | 6:59 |
| | Getting Data From the API With SimpleHTTPOperator | 3:38 |
| | Writing into Postgres | 4:12 |
| | Parallel Processing | 4:15 |
| **Recap** | Recap & Outlook | 4:38 |
---
#### What’s Next?
After completing this roadmap, you’ll have the confidence and skills to not just analyze data but to engineer and optimize it like a pro! Explore advanced topics, start contributing to projects, and showcase your new skills to potential employers.
### Roadmap for Data Scientists
#### 14-Week Data Engineering Roadmap for Data Scientists
#### From Notebooks to Production: Build, Deploy, and Scale Your ML Workflows
#### Start this roadmap at my Academy: [Start Today](https://learndataengineering.com/p/data-engineering-for-data-scientists)
---
#### Who Is This Roadmap For?
- Data Scientists who want to deploy and maintain ML models in production
- ML practitioners struggling with real-time data, CI/CD, and orchestration
- Data professionals looking to expand their engineering toolkit
- Anyone ready to go beyond notebooks and automate their ML workflows
---
#### What You’ll Achieve
This roadmap provides a step-by-step approach to gaining production-grade data engineering skills. You'll start with pipelines and containerization, move on to deployment and orchestration, and finish with big data and monitoring.

#### Learning Goals
| Goal # | Description |
| ------- | -------------------------------------------------- |
| Goal #1 | Build an End-to-End ML Pipeline on AWS |
| Goal #2 | Add CI/CD & Containerization to Your Platform |
| Goal #3 | Implement the Lakehouse Architecture in AWS or GCP |
| Goal #4 | Orchestrate Your Pipelines with Airflow |
| Goal #5 | Process Big Data with Apache Spark & Streaming |
| Goal #6 | Analyze Your ML Training Logs with Elasticsearch |
---
#### 14-Week Learning Roadmap
| Week | Topic |
| ---------- | -------------------------------------------- |
| Week 1 | Platform & Pipeline Design |
| Week 2 | Docker Fundamentals |
| Week 3 | Relational Data Modeling |
| Week 4 | Working & Designing APIs |
| Week 5 & 6 | ML & Containerization on AWS |
| Week 7 | ETL & CI/CD on AWS |
| Week 8 | Building a Lakehouse on AWS or GCP |
| Week 9 | Orchestrate with Airflow |
| Week 10 | Pre-Process Data with Apache Spark |
| Week 11-13 | Build a Streaming Pipeline (AWS, Azure, GCP) |
| Week 14 | Analyze Training Logs with Elasticsearch |
---
#### Week 1: Platform & Pipeline Design
##### Description
Data pipelines are the foundation of any data platform. In this 110-minute training, you'll learn about stream, batch, and ML pipelines. You'll also explore platform blueprints, architecture components, and Lambda architecture.
**Check out this course in my Academy: [Learn More](https://learndataengineering.com/p/data-pipeline-design)**
##### Course Curriculum
| Lesson | Duration |
| ------------------------------------------------ | ----------- |
| Platform Blueprint & End to End Pipeline Example | 10:11 |
| Data Engineering Tools Guide | 2:44 |
| End to End Pipeline Example | 6:18 |
| Push Ingestion Pipelines | 3:42 |
| Pull Ingestion Pipelines | 3:34 |
| Batch Pipelines | 3:07 |
| Streaming Pipelines | 3:34 |
| Stream Analytics | 2:26 |
| Lambda Architecture | 4:02 |
| Visualization Pipelines | 3:47 |
| Visualization with Hive & Spark on Hadoop | 6:21 |
| Visualization Data via Spark Thrift Server | 3:27 |
| Platform Examples (AWS, Azure, GCP, Hadoop) | Slides Only |
---
#### Week 2: Docker Fundamentals
##### Description
Docker is the go-to container platform for engineers. This training covers key concepts, hands-on Docker usage, building and running containers, and how Docker fits into production workflows.
**Check out this course in my Academy: [Learn More](https://learndataengineering.com/p/docker-fundamentals)**
##### Course Curriculum
| Lesson | Duration |
| ----------------------------------- | -------- |
| Docker vs Virtual Machines | 6:23 |
| Docker Terminology | 5:56 |
| Installing Docker Desktop | 4:09 |
| Pulling Images & Running Containers | 6:34 |
| CLI Cheat Sheet | 3:38 |
| Docker Compose Explained | 6:34 |
| Build & Run Hello World Image | 6:28 |
| Build Image with Dependencies | 5:05 |
| Using DockerHub | 4:24 |
| Image Layers | 7:55 |
| Deployment in Production | 5:47 |
| Security Best Practices | 4:09 |
| Managing Docker with Portainer | 4:04 |
---
#### Week 3: Relational Data Modeling
##### Description
Learn how to design efficient and scalable relational models. You'll go through conceptual to physical modeling and normalize your schema. You'll use MySQL and MySQL Workbench for hands-on practice.
**Check out this course in my Academy: [Learn More](https://learndataengineering.com/p/relational-data-modeling)**
##### Course Curriculum
| Lesson | Duration |
| -------------------------------- | -------- |
| History of Relational Models | 3:16 |
| Installing MySQL & Workbench | 8:04 |
| Workbench Introduction | 4:36 |
| The Design Process Explained | 4:14 |
| Discover Entities | 10:24 |
| Discover Attributes | 13:09 |
| Normalize & Define Relationships | 11:19 |
| Identifying vs Non-identifying | 2:01 |
| Resolve Many-to-Many | 4:00 |
| Resolve One-to-Many | 2:34 |
| Resolve One-to-One | 1:45 |
| Create ER Diagram | 19:46 |
| Create Physical Data Model | 4:13 |
| Populate from XLS | 15:13 |
| Course Conclusion | 1:28 |
---
#### Week 4: Working & Designing APIs
##### Description
APIs are the backbone of modern data platforms. You'll learn how to build and test APIs using FastAPI, design schemas, and deploy them in Docker. Postman and Docker are used for testing and deployment.
**Check out this course in my Academy: [Learn More](https://learndataengineering.com/p/apis-with-fastapi-course)**
##### Course Curriculum
| Lesson | Duration |
| ----------------------------- | -------- |
| What are APIs? | 8:29 |
| Hosting vs Using APIs | 4:08 |
| HTTP Methods & Media Types | 6:56 |
| Response Codes & Parameters | 9:40 |
| FastAPI Setup | 4:55 |
| POST, GET, PUT API Methods | 16:18 |
| Testing with Postman | 4:22 |
| Deploying FastAPI with Docker | 6:01 |
| API Security Best Practices | 3:48 |
---
#### Week 5 & 6: ML & Containerization on AWS
##### Description
This hands-on project teaches you how to build a real-time ML pipeline on AWS. You'll pull data from the Twitter API (or The Guardian API), apply sentiment analysis with NLTK in a Lambda function, store results in a Postgres database via RDS, and build a Streamlit dashboard. Finally, you’ll containerize and deploy the dashboard using AWS ECS and ECR.
**Check out this project in my Academy: [Learn More](https://learndataengineering.com/p/ml-on-aws)**
##### Course Curriculum
| Lesson | Duration |
| -------------------------------------------------- | -------- |
| Introduction | 2:38 |
| Project Architecture Explained | 2:06 |
| RDS Setup | 2:37 |
| VPC Inbound Rules | 2:12 |
| PG Admin Installation & S3 Config | 4:05 |
| Lambda Intro & IAM Setup | 3:11 |
| Create Lambda Function | 1:24 |
| Lambda Code Explained | 8:22 |
| Insert Code Into Lambda | 0:56 |
| Add Layers from Klayers | 5:32 |
| Create Custom Layers | 4:40 |
| Test Lambda & Set Env Variables | 4:53 |
| Schedule Lambda with EventBridge | 3:15 |
| Setup Virtual Conda Environment | 4:07 |
| Install Dependencies with Poetry | 5:57 |
| Streamlit App Code Walkthrough | 7:52 |
| Setup ECR Container Registry | 1:52 |
| AWS CLI Install & Login | 5:19 |
| Dockerfile Build & Push | 2:52 |
| Create ECS Fargate Cluster | 1:34 |
| ECS Task Configuration & Deployment | 4:59 |
| Fixing ECS Task | 5:14 |
| Stop ECS Task | 0:59 |
| Project Conclusion | 5:06 |
---
#### Week 7: ETL & CI/CD on AWS
##### Description
In this project, you'll build a lightweight ETL job that pulls data from a public weather API and writes it into a time series database. You’ll dockerize the job, schedule it using AWS Lambda and EventBridge, and visualize the data using Grafana.
**Check out this project in my Academy: [Learn More](https://learndataengineering.com/p/timeseries-etl-with-aws-tdengine-grafana)**
##### Course Curriculum
| Lesson | Duration |
| -------------------------------------------- | -------- |
| Quick Note from Andreas | 0:43 |
| Project Introduction | 1:26 |
| Setup of the Project | 2:52 |
| Time Series Data Basics | 2:20 |
| Big Pros of Time Series Databases | 2:06 |
| About TDengine | 1:22 |
| Setup Weather API | 1:04 |
| Code Query API | 2:41 |
| TDengine Setup | 3:04 |
| Connect Python to TDengine | 1:50 |
| Lambda Docker Container & Push to ECR | 1:55 |
| AWS Setup | 1:36 |
| Create Lambda Function Using Docker Image | 1:04 |
| Schedule Function with EventBridge | 1:25 |
| CloudWatch Lambda Events | 0:27 |
| Grafana Setup | 3:01 |
---
#### Week 8: Building a Lakehouse on AWS or GCP
##### Description
This week, you’ll learn how to combine data lakes and warehouses into a Lakehouse architecture. You’ll implement a full data analytics stack using tools like S3, Athena, BigQuery, Glue, Quicksight, and Data Studio.
**Check out this course in my Academy: [Learn More](https://learndataengineering.com/p/modern-data-warehouses)**
##### Course Curriculum
| Lesson | Duration |
| -------------------------------------------------------- | -------- |
| Introduction | 2:13 |
| Data Science Platform Overview | 4:10 |
| ETL & ELT in Warehouses | 6:22 |
| Data Lake & Warehouse Integration | 3:29 |
| GCP Pipelines Overview | 3:13 |
| Cloud Storage & BigQuery Hands-on | 8:35 |
| Create Dashboard in Data Studio | 7:33 |
| GCP Recap & AWS Goals | 2:12 |
| Upload Data to S3 | 2:12 |
| Athena Manual Table Configuration | 3:48 |
| Create Dashboard in Quicksight | 5:05 |
| Athena via Glue Catalog | 3:29 |
| Course Recap | 2:36 |
| BONUS: Redshift Spectrum with S3 | 2:57 |
---
#### Week 9: Orchestrate with Airflow
##### Description
This training will guide you through installing and running Apache Airflow in Docker, creating DAGs, using the Taskflow API, and monitoring workflow execution.
**Check out this course in my Academy: [Learn More](https://learndataengineering.com/p/learn-apache-airflow)**
##### Course Curriculum
| Lesson | Duration |
| --------------------------------------------- | -------- |
| Introduction | 1:36 |
| Airflow Usage | 3:19 |
| Fundamental Concepts | 2:47 |
| Airflow Architecture | 3:09 |
| Example Pipelines | 4:49 |
| Spotlight on 3rd Party Operators | 2:17 |
| Airflow XComs | 4:32 |
| Project Setup | 1:43 |
| Docker Setup Explained | 2:06 |
| Docker Compose & Starting Containers | 4:23 |
| Checking Services | 1:48 |
| Weather API Setup | 1:33 |
| Postgres DB Setup | 1:58 |
| Airflow Web Interface | 4:37 |
| Create DAG with Airflow 2.0 | 9:46 |
| Run Your DAG | 4:15 |
| Create DAG with Taskflow API | 6:59 |
| Get Data via SimpleHTTP Operator | 3:38 |
| Write to Postgres | 4:12 |
| Parallel Processing | 4:15 |
| Recap & Outlook | 4:38 |
---
#### Week 10: Pre-Process Data with Apache Spark
##### Description
This training introduces Apache Spark fundamentals, showing you how to process large datasets using Spark DataFrames, RDDs, and SparkSQL inside Docker and Jupyter Notebooks.
**Check out this course in my Academy: [Learn More](https://learndataengineering.com/p/learning-apache-spark-fundamentals)**
##### Course Curriculum
| Lesson | Duration |
| ------------------------------------- | -------- |
| Introduction & Contents | 3:30 |
| Vertical vs Horizontal Scaling | 3:55 |
| What Spark Is Good For | 4:45 |
| Driver, Context & Executors | 4:11 |
| Cluster Types | 1:59 |
| Client vs Cluster Deployment | 6:11 |
| Where to Run Spark | 3:38 |
| Tools in Spark Course | 2:35 |
| Dataset Overview | 4:11 |
| Docker Setup | 2:52 |
| Jupyter Notebook Setup & Run | 5:31 |
| RDDs | 3:57 |
| DataFrames | 1:40 |
| Transformations & Actions Overview | 2:59 |
| Transformations | 2:22 |
| Actions | 3:06 |
| JSON Transformations | 9:52 |
| Working with Schemas | 8:23 |
| Working with DataFrames | 10:09 |
| SparkSQL | 5:04 |
| Working with RDDs | 12:52 |
---
#### Week 11–13: Build a Streaming Pipeline on AWS, Azure, or GCP
##### Description
In this 3-week section, you'll complete an end-to-end streaming data project on the cloud platform of your choice: AWS, Azure, or GCP. Each project teaches you how to ingest real-time data, process it, store it, and create visualizations.
You only need to complete one of the following three options:
---
##### Option 1: Streaming Pipeline on AWS
##### Description
You'll use AWS services like API Gateway, Kinesis, DynamoDB, Redshift, Lambda, Glue, and Power BI to create a complete streaming solution. You'll work with e-commerce data and build multiple ingestion and batch pipelines.
**Check out this project in my Academy: [Learn More](https://learndataengineering.com/p/data-engineering-on-aws)**
##### Course Curriculum
| Lesson | Duration |
| -------------------------------------------- | -------- |
| Data Engineering | 4:15 |
| Data Science Platform | 5:20 |
| Dataset Introduction | 3:16 |
| Relational Storage Possibilities | 3:46 |
| NoSQL Storage Possibilities | 6:28 |
| Platform Design & Pipeline Planning | 3:49 |
| Client to Visualization Design | 3:00 |
| Data Ingestion to Kinesis | 3:00 |
| Stream to S3 and DynamoDB | 5:28 |
| Visualization API & Redshift | 5:29 |
| AWS Setup & IAM | 4:06 |
| Create Lambda Functions | 2:33 |
| Configure Firehose & Debugging | 7:43 |
| Power BI Setup | 12:16 |
| Glue Crawlers and Jobs | 26:52 |
---
##### Option 2: Streaming Pipeline on Azure
##### Description
You’ll build a Twitter-like JSON stream pipeline using Azure Functions, Event Hub, Cosmos DB, and Power BI. You’ll learn how to set up API management, key vaults, and authentication.
**Check out this project in my Academy: [Learn More](https://learndataengineering.com/p/build-streaming-data-pipelines-in-azure)**
##### Course Curriculum
| Lesson | Duration |
| ---------------------------------------------------- | -------- |
| Project Introduction | 2:43 |
| Local Preprocessing & Docker Setup | 7:06 |
| Develop & Deploy Azure Functions | 5:52 |
| Test Functions & Integrate with Blob Storage | 6:26 |
| Add Functions to Azure API Management (APIM) | 7:05 |
| Key Vault & Authentication | 4:41 |
| Create Event Hubs and Bindings | 6:59 |
| Write to Cosmos DB | 9:03 |
| Power BI Connection and Dashboard Creation | 6:32 |
---
##### Option 3: Streaming Pipeline on GCP
##### Description
This project shows how to extract weather data via API, stream it with Pub/Sub, write it into Cloud SQL, and visualize it with Looker Studio. You'll also learn function deployment and VM/database setup.
**Check out this project in my Academy: [Learn More](https://learndataengineering.com/p/data-engineering-on-gcp)**
##### Course Curriculum
| Lesson | Duration |
| --------------------------------------------------- | -------- |
| Introduction & Setup | 2:43 |
| Architecture & Weather API | 5:31 |
| Enable APIs & Configure Scheduling | 4:00 |
| Setup MySQL Database & Compute Engine | 4:40 |
| Create Cloud Functions for Data Ingestion | 8:37 |
| Use Pub/Sub for Messaging | 1:41 |
| Write Data to Cloud SQL | 13:43 |
| Test and Monitor Data Flow | 5:51 |
| Setup Looker Studio & Build Dashboards | 4:17 |
| Monitor Pipelines | 6:20 |
---
#### Week 14: Analyze Training Logs with Elasticsearch
##### Description
Wrap up your roadmap by learning how to monitor pipelines using Elasticsearch. You’ll deploy Elasticsearch with Docker, send logs from your training pipelines, and visualize them in Kibana dashboards.
**Check out this course in my Academy: [Learn More](https://learndataengineering.com/p/log-analysis-with-elasticsearch)**
##### Course Curriculum
| Lesson | Duration |
| ------------------------------------------------ | -------- |
| Course Introduction | 2:07 |
| Elasticsearch vs Relational Databases | 5:43 |
| ETL Log Analysis & Debugging | 3:54 |
| Streaming Log Analysis & Debugging | 2:48 |
| Solving Problems with Elasticsearch | 4:37 |
| ELK Stack Overview | 2:03 |
| Setup Limiting RAM & Environment Config | 4:26 |
| Running Elasticsearch | 4:07 |
| Elasticsearch APIs & Python Index Creation | 7:31 |
| Write Logs (JSON) to Elasticsearch | 4:46 |
| Create Kibana Visualizations & Dashboards | 9:27 |
| Search Logs in Elasticsearch | 4:57 |
| Course Recap | — |
---
#### What’s Next?
After 14 weeks, you’ll have built scalable, production-ready data pipelines and ML workflows. You can now explore more advanced projects, optimize performance, and contribute to production systems with confidence. Need help showcasing your skills or getting hired? Reach out to my coaching program!
### Roadmap for Software Engineers

If you're transitioning from a background in computer science or software engineering into data engineering, you're already equipped with a solid foundation. Your existing knowledge in coding, familiarity with SQL databases, understanding of computer networking, and experience with operating systems like Linux, provide you with a considerable advantage. These skills form the cornerstone of data engineering and can significantly streamline your learning curve as you embark on this new journey.
Here's a refined roadmap, incorporating your prior expertise, to help you excel in data engineering:
- **Deepen Your Python Skills:** Python is crucial in data engineering for processing and handling various data formats, such as APIs, CSV, and JSON. Given your coding background, focusing on Python for data engineering will enhance your ability to manipulate and process data effectively.
- **Master Docker:** Docker is essential for deploying code and managing containers, streamlining the software distribution process. Your understanding of operating systems and networking will make mastering Docker more intuitive, as you'll appreciate the importance of containerization in today's development and deployment workflows.
- **Platform and Pipeline Design:** Leverage your knowledge of computer networking and operating systems to grasp the architecture of data platforms. Understanding how to design data pipelines, including considerations for stream and batch processing, and emphasizing security, will be key. Your background will provide a solid foundation for understanding how different components integrate within a data platform.
- **Choosing the Right Data Stores:** Dive into the specifics of data stores, understanding the nuances between transactional and analytical databases, and when to use relational vs. NoSQL vs. document stores vs. time-series databases. Your experience with SQL databases will serve as a valuable baseline for exploring these various data storage options.
- **Explore Cloud Platforms:** Get hands-on with cloud services such as AWS, GCP, and Azure. Projects or courses that offer practical experience with these platforms will be invaluable. Your tasks might include building pipelines to process data from APIs, using message queues, or delving into data warehousing and lakes, capitalizing on your foundational skills.
- **Optional Deep Dives:** For those interested in advanced data processing, exploring technologies like Spark or Kafka for stream processing can be enriching. Additionally, learning how to build APIs and work with MongoDB for document storage can open new avenues, especially through practical projects.
- **Log Analysis and Data Observability:** Familiarize yourself with tools like Elasticsearch, Grafana, and InfluxDB to monitor and analyze your data pipelines effectively. This area leverages your comprehensive understanding of how systems communicate and operate, enhancing your ability to maintain and optimize data flows.
As you embark on this path, remember that your journey is unique. Your existing knowledge not only serves as a strong foundation but also as a catalyst for accelerating your growth in the realm of data engineering. Keep leveraging your strengths, explore areas of interest deeply, and continually adapt to the evolving landscape of data technology.
| Live Stream -> Data Engineering Roadmap for Computer Scientists / Developers
|------------------|
|In this live stream you'll find even more details on how to read this roadmap for Computer Scientists / Developers, why I chose these tools, and why I think this is the right way to do it.
| [Watch on YouTube](https://youtube.com/live/0e4WfIUixRw)|
## Data Engineers Skills Matrix

If you're diving into the world of data engineering or looking to climb the ladder within this field, you're in for a treat with this enlightening YouTube video. Andreas kicks things off by introducing us to a very handy tool they've developed: the Data Engineering Skills Matrix. This isn't just any chart; it's a roadmap designed to navigate the complex landscape of data engineering roles, ranging from a Junior Data Engineer to the lofty heights of a Data Architect and Machine Learning Engineer.
| Live Stream -> Data Engineering Skills Matrix
|------------------|
|In this live stream you'll find even more details on how to read this skills matrix for Data Engineers.
| [Watch on YouTube](https://youtube.com/live/5E0UiBy0Kwo)|
Andreas takes us through the intricacies of this matrix, layer by layer. Starting with the basics, they discuss the minimum experience needed for each role. It's an eye-opener, especially when you see how experience requirements evolve from beginner to senior levels. But it's not just about how many years you've spent in the field; it's about the skills you've honed during that time.
### Challenges & Responsibilities
As the conversation progresses, Andreas delves into the core responsibilities and main tasks associated with each role. You'll learn what sets a Junior Data Engineer apart from a Senior Data Engineer, the unique challenges a Data Architect faces, and the critical skills a Machine Learning Engineer must possess. This part of the video is golden for anyone trying to understand where they fit in the data engineering ecosystem or plotting their next career move.
### SQL & Soft Skills
Then there's the talk on SQL knowledge and its relevance across different roles. This segment sheds light on how foundational SQL is, irrespective of your position. But it's not just about technical skills; the video also emphasizes soft skills, like leadership and collaboration, painting a holistic picture of what it takes to succeed in data engineering.
For those who love getting into the weeds, Andreas doesn't disappoint. They discuss software development skills, debugging, and even dive into how data engineers work with SQL and databases. This part is particularly insightful for understanding the technical depth required at various stages of your career.
### Q&A
And here's the cherry on top: Andreas encourages interaction, inviting viewers to share their experiences and questions. This makes the video not just a one-way learning experience but a dynamic conversation that enriches everyone involved.
### Summary
By the end of this video, you'll walk away with a clear understanding of the data engineering field's diverse roles. You'll know the skills needed to excel in each role and have a roadmap for your career progression. Whether you're a recent graduate looking to break into data engineering or a seasoned professional aiming for a senior position, Andreas's video is a must-watch. It's not just a lecture; it's a guide to navigating the exciting world of data engineering, tailored by someone who's taken the time to lay out the journey for you.
## How to Become a Senior Data Engineer
Becoming a senior data engineer is a goal many in the tech industry aspire to. It's a role that demands a deep understanding of data architecture, advanced programming skills, and the ability to lead and communicate effectively within an organization. In this live stream series, I dove into what it takes to climb the ladder to a senior data engineering position. Here are the key takeaways. You can find the links to the videos and the images shown below.
### Understanding the Role
The journey to becoming a senior data engineer starts with a clear understanding of what the role entails. Senior data engineers are responsible for designing, implementing, and maintaining an organization's data architecture. They ensure data accuracy, accessibility, and security, often taking the lead on complex projects that require advanced technical skills and strategic thinking.
### Key Skills and Knowledge Areas
Based on insights from the live stream and consultations with industry experts, including GPT-3, here are the critical areas where aspiring senior data engineers should focus their development:
- **Advanced Data Modeling and Architecture:** Mastery of data modeling techniques and architecture best practices is crucial. This includes understanding of dimensional and Data Vault modeling, as well as expertise in SQL and NoSQL databases.
- **Big Data Technologies:** Familiarity with distributed computing frameworks (like Apache Spark), streaming technologies (such as Apache Kafka), and cloud-based big data technologies is essential.
- **Advanced ETL Techniques:** Skills in incremental loading, data merging, and transformation are vital for efficiently processing large datasets.
- **Data Warehousing and Data Lake Implementation:** Building and maintaining scalable and performant data warehouses and lakes are fundamental responsibilities.
- **Cloud Computing:** Proficiency in cloud services from AWS, Azure, or GCP, along with platforms like Snowflake and Databricks, is increasingly important.
- **Programming and Scripting:** Advanced coding skills in languages relevant to data engineering, such as Python, Scala, or Java, are non-negotiable.
- **Data Governance and Compliance:** Understanding data governance frameworks and compliance requirements is critical, especially in highly regulated industries.
- **Leadership and Communication:** Beyond technical skills, the ability to lead projects, communicate effectively with both technical and non-technical team members, and mentor junior engineers is what differentiates a senior engineer.
### Learning Pathways
Becoming a senior data engineer requires continuous learning and real-world experience. Here are a few steps to guide your journey:
- **Educational Foundation:** Start with a strong foundation in computer science or a related field. This can be through formal education or self-study courses.
- **Gain Practical Experience:** Apply your skills in real-world projects. This could be in a professional setting, contributions to open-source projects, or personal projects.
- **Specialize and Certify:** Consider specializing in areas particularly relevant to your interests or industry needs. Obtaining certifications in specific technologies or platforms can also bolster your credentials.
- **Develop Soft Skills:** Work on your communication, project management, and leadership skills. These are as critical as your technical abilities.
- **Seek Feedback and Mentorship:** Learn from the experiences of others. Seek out mentors who can provide guidance and feedback on your progress.
### Video 1
| Live Stream -> How to become a Senior Data Engineer - Part 1
|------------------|
| In this part one I talked about Data Modeling, Big Data, ETL, Data Warehousing & Data Lakes as well as Cloud computing
| [Watch on YouTube](https://youtube.com/live/M-6xkTCKQQc)|

### Video 2
| Live Stream -> How to become a Senior Data Engineer - Part 2
|------------------|
| In part two I talked about real time data processing, programming & scripting, data governance, compliance and data security
| [Watch on YouTube](https://youtube.com/live/po96pzpjxvA)|

### Video 3
| Live Stream -> How to become a Senior Data Engineer - Part 3
|------------------|
| In part 3 I focused on everything regarding Leadership and Communication: team management, project management, collaboration, problem solving, strategic thinking, communication and leadership
| [Watch on YouTube](https://youtube.com/live/DMumpzSyRjI)|

### Final Thoughts
The path to becoming a senior data engineer is both challenging and rewarding. It requires a blend of technical prowess, continuous learning, and the development of soft skills that enable you to lead and innovate. Whether you're just starting out or looking to advance your career, focusing on the key areas outlined above will set you on the right path.
================================================
FILE: sections/02-BasicSkills.md
================================================
Basic Computer Science Skills
=============================
## Contents
- [Learn to Code](02-BasicSkills.md#learn-to-code)
- [Get Familiar with Git](02-BasicSkills.md#get-familiar-with-git)
- [Agile Development](02-BasicSkills.md#agile-development)
- [Why Is Agile So Important?](02-BasicSkills.md#why-is-agile-so-important)
- [Agile Rules I Learned Over the Years](02-BasicSkills.md#agile-rules-i-learned-over-the-years)
- [Agile Frameworks](02-BasicSkills.md#agile-frameworks)
- [Scrum](02-BasicSkills.md#scrum)
- [OKR](02-BasicSkills.md#okr)
- [Software Engineering Culture](02-BasicSkills.md#software-engineering-culture)
- [Learn How a Computer Works](02-BasicSkills.md#learn-how-a-computer-works)
- [Data Network Transmission](02-BasicSkills.md#data-network-transmission)
- [Security and Privacy](02-BasicSkills.md#security-and-privacy)
- [SSL Public and Private Key Certificates](02-BasicSkills.md#ssl-public-and-private-key-certificates)
- [JSON Web Tokens](02-BasicSkills.md#json-web-tokens)
- [GDPR Regulations](02-BasicSkills.md#gdpr-regulations)
- [Linux](02-BasicSkills.md#linux)
- [OS Basics](02-BasicSkills.md#os-basics)
- [Shell Scripting](02-BasicSkills.md#shell-scripting)
- [Cron Jobs](02-BasicSkills.md#cron-jobs)
- [Packet Management](02-BasicSkills.md#packet-management)
- [Docker](02-BasicSkills.md#docker)
- [What is Docker and How it Works](02-BasicSkills.md#what-is-docker-and-what-do-you-use-it-for)
- [Kubernetes Container Deployment](02-BasicSkills.md#kubernetes-container-deployment)
- [Why and How To Do Docker Container Orchestration](02-BasicSkills.md#why-and-how-to-do-docker-container-orchestration)
- [Useful Docker Commands](02-BasicSkills.md#useful-docker-commands)
- [The Cloud](02-BasicSkills.md#the-cloud)
- [IaaS vs. PaaS vs. SaaS](02-BasicSkills.md#iaas-vs-paas-vs-saas)
- [AWS Azure IBM Google](02-BasicSkills.md#aws-azure-ibm-google)
- [Cloud vs. On-Premises](02-BasicSkills.md#cloud-vs-on-premises)
- [Security](02-BasicSkills.md#security)
- [Hybrid Clouds](02-BasicSkills.md#hybrid-clouds)
- [Data Scientists and Machine Learning](02-BasicSkills.md#data-scientists-and-machine-learning)
- [Machine Learning Workflow](02-BasicSkills.md#machine-learning-workflow)
- [Machine Learning Model and Data](02-BasicSkills.md#machine-learning-model-and-data)
Learn to Code
-------------
Why this is important: Without coding you cannot do much in data
engineering. I cannot count the number of times I needed a quick hack to solve a problem.
The possibilities are endless:
- Writing or quickly getting some data out of a SQL DB.
- Testing to produce messages to a Kafka topic.
- Understanding the source code of a Webservice
- Reading counter statistics out of a HBase key-value store.
So, which language do I recommend then?
If you had asked me a few years ago, I would have said Java, 100%. Nowadays, though, the community has moved heavily to Python. I highly recommend starting with it.
When you are getting into data processing with Spark you can use
Scala which is a JVM language, but Python is also very good here.
Python is a great choice. It is super versatile.
Where to Learn Python? There are free Python courses all over the internet.
- I have a beginner one in my Data Engineering academy: [Introduction to Python course](https://learndataengineering.com/p/introduction-to-python)
- I also have a Python for Data Engineers one in my Data Engineering academy: [Python for Data Engineers course](https://learndataengineering.com/p/python-for-data-engineers)
Keep in mind to always keep it practical: Learning by doing!
I talked about the importance of learning by doing in this podcast:
Get Familiar with Git
---------------------
Why this is important: One of the major problems with coding is to keep
track of changes. It is also almost impossible to maintain a program you
have multiple versions of.
Another problem is the topic of collaboration and documentation, which
is super important.
Let's say you work on a Spark application and your colleagues need to
make changes while you are on holiday. Without some code management, they
are in huge trouble:
Where is the code? What have you changed last? Where is the
documentation? How do we mark what we have changed?
But, if you put your code on GitHub, your colleagues can find your code.
They can understand it through your documentation (please also have
in-line comments).
Developers can pull your code, make a new branch, and do the changes.
After your holiday, you can inspect what they have done and merge it with
your original code, and you end up having only one application.
Where to learn: Check out the GitHub Guides page where you can learn all
the basics:
This great GitHub commands cheat sheet saved my butt multiple times:
Also look into:
- Pull
- Push
- Branching
- Forking
GitHub uses markdown to write pages, a super simple language that is actually a lot of fun to write. Here's a markdown cheatsheet:
Pandoc is a great tool to convert any text file to and from markdown:
Agile Development
-----------------
Agility is the ability to adapt quickly to changing circumstances.
These days, everyone wants to be agile. Big and small companies are
looking for the "startup mentality."
Many think it's the corporate culture. Others think it's the process of how
we create things that matters.
In this article, I am going to talk about agility and self-reliance,
about how you can incorporate agility in your professional career.
### Why Is Agile So Important?
Historically, development has been practiced as an explicitly defined process. You
think of something, specify it, have it developed, and then build in mass
production.
It's a bit of an arrogant process. You assume that you already know
exactly what a customer wants, or how a product has to look and how
everything works out.
The problem is that the world does not work this way!
Oftentimes the circumstances change because of internal factors.
Sometimes things just do not work out as planned or stuff is harder than
you think.
You need to adapt.
Other times you find out that you built something customers do not like
and that needs to be changed.
You need to adapt.
That's why people jump on the Scrum train -- because Scrum is the
definition of agile development, right?
### Agile Rules I Learned Over the Years
#### Is the Method Making a Difference?
Yes, Scrum or Google's OKR can help to be more agile. The secret to
being agile, however, is not only how you create.
What makes me cringe is people trying to tell you that being agile
starts in your head. So, the problem is you?
No!
The biggest lesson I have learned over the past years is this: Agility
goes down the drain when you outsource work.
#### The Problem with Outsourcing
I know on paper outsourcing seems like a no-brainer: development costs
against the fixed costs.
It is expensive to bind existing resources on a task. It is even more
expensive if you need to hire new employees.
The problem with outsourcing is that you pay someone to build stuff for
you.
It does not matter who you pay to do something for you. He needs to make
money.
His agenda will be to spend as little time as possible on your work. That
is why outsourcing requires contracts, detailed specifications,
timetables, and delivery dates.
He doesn't want to spend additional time on a project, only because you
want changes in the middle. Every unplanned change costs him time and
therefore money.
If so, you need to make another detailed specification and a contract
change.
He is not going to put his mind into improving the product while
developing. Firstly, because he does not have the big picture. Secondly,
because he does not want to.
He is doing as he is told.
Who can blame him? If I were the subcontractor, I would do exactly the
same!
Does this sound agile to you?
#### Knowledge Is King: A lesson from Elon Musk
Doing everything in house -- that's why startups are so productive. No
time is wasted on waiting for someone else.
If something does not work or needs to be changed, there is someone on
the team who can do it right away.
One very prominent example who follows this strategy is Elon Musk.
Tesla's Gigafactories are designed to get raw materials in on one side
and spit out cars on the other. Why do you think Tesla is building
Gigafactories that cost a lot of money?
Why is SpaceX building its own space engines? Clearly, there are other,
older companies who could do that for them.
Why is Elon building tunnel boring machines at his new boring company?
At first glance, this makes no sense!
#### How You Really Can Be Agile
If you look closer, it all comes down to control and knowledge. You, your
team, your company, need to do as much as possible on your own.
Self-reliance is king.
Build up your knowledge and therefore the team's knowledge. When you have
the ability to do everything yourself, you are in full control.
You can build electric cars, build rocket engines, or bore tunnels.
Don't largely rely on others, and be confident to just do stuff on your
own.
Dream big, and JUST DO IT!
PS. Don't get me wrong. You can still outsource work. Just do it in a
smart way by outsourcing small independent parts.
### Agile Frameworks
#### Scrum
There's an interesting Medium article with a lot of details
about Scrum:
Also, this Scrum guide webpage has good info:
#### OKR
I personally love OKR and have been using it for years. Especially for smaller
teams, OKR is great. You don't have a lot of overhead and get work done.
It helps you stay focused and look at the bigger picture.
I recommend doing a sync meeting every Monday. There you talk about what
happened last week and what you are going to work on this week.
I talked about this in this podcast:
There is also this awesome 1.5-hour startup guide from Google:
I really love this video; I rewatched it
multiple times.
### Software Engineering Culture
The software engineering and development culture is super important. How
does a company handle product development with hundreds of developers?
Check out this podcast:
| Podcast episode: #070 Engineering Culture At Spotify
|------------------
|In this podcast, we look at the engineering culture at Spotify, my favorite music streaming service. The process behind the development of Spotify is really awesome.
|[Watch on YouTube](https://youtu.be/1asVrsUDbp0) \ [Listen on Anchor](https://anchor.fm/andreaskayy/episodes/070-The-Engineering-Culture-At-Spotify-e45ipa)|
**Some interesting slides:**
Learn How a Computer Works
--------------------------
### CPU,RAM,GPU,HDD
### Differences Between PCs and Servers
I talked about computer hardware and GPU processing in this podcast:
Data Network Transmission
---------------------------------------
### OSI Model
The OSI Model describes how data flows through the network. It
consists of layers starting from physical layers, basically how the data
is transmitted over the line or optic fiber.
Check out this article for a deeper understanding of the layers and processes outlined in the OSI model:
The Wikipedia page is also very good:
###### Which Protocol Lives on Which Layer?
Check out this network protocol map. Unfortunately, it is really hard to
find it these days:
### IP Subnetting
Check out this IP address and subnet guide from Cisco:
A calculator for subnets:
### Switch, Layer-3 Switch
For an introduction to how ethernet went from broadcasts, to bridges, to
Ethernet MAC switching, to ethernet & IP (layer 3) switching, to
software-defined networking, and to programmable data planes that can
switch on any packet field and perform complex packet processing, see
this video:
### Router
### Firewalls
I talked about network infrastructure and techniques in this podcast:
Security and Privacy
--------------------
### SSL Public and Private Key Certificates
### JSON Web Tokens
Link to the Wiki page:
### GDPR Regulations
The EU created the GDPR \"General Data Protection Regulation\" to
protect your personal data like: name, age, address, and so
on.
It's huge and quite complicated. If you want to do online business in
the EU, you need to apply these rules. The GDPR has been applicable since May
25th, 2018. So, if you haven't looked into it, now is the time.
The penalties can be crazy high if you make mistakes here.
Check out the full GDPR regulation here:
By the way, if you do profiling or analyse big data in general, look
into it. There are some important regulations, unfortunately.
I spent months on GDPR compliance. Super fun. Not! Hahaha
### Privacy by Design
When should you look into privacy regulations and solutions?
Creating the product or service first and then bolting on the privacy is
a bad choice. The best way is to start implementing privacy right away
in the engineering phase.
This is called privacy by design. Privacy is an integral part of your
business, not just something optional.
Check out the Wikipedia page to get a feeling for the important
principles:
Linux
-----
Linux is very important to learn, at least the basics. Most big-data
tools or NoSQL databases run on Linux.
From time to time, you need to modify stuff through the operating system,
especially if you run an infrastructure as a service solution like
Cloudera CDH, Hortonworks, or a MapR Hadoop distribution.
### OS Basics
Show all historic commands:
history | grep docker
### Shell scripting
Ah, creating shell scripts in 2019? Believe it or not, scripting in the
command line is still important.
Start a process, automatically rename, move or do a quick compaction of
log files. It still makes a lot of sense.
Check out this cheat sheet to get started with scripting in Linux:
There's also this Medium article with a super-simple example for
beginners:
### Cron Jobs
Cron jobs are super important to automate simple processes or jobs in
Linux. You need this here and there, I promise. Check out these three
guides:
And, of course, Wikipedia, which is surprisingly good:
Pro tip: Don't forget to end your cron files with an empty line or a
comment, otherwise it will not work.
### Packet Management
Linux tips are the second part of this podcast:
Docker
------
### What is Docker, and What Do You Use It for?
Have you played around with Docker yet? If you're a data science learner
or a data scientist, you need to check it out!
It's awesome because it simplifies the way you can set up development
environments for data science. If you want to set up a dev environment,
you usually have to install a lot of packages and tools.
#### Don't Mess Up Your System
What this does is basically mess up your operating system. If you're
just starting out, you don't know which packages you need to install. You don't
know which tools you need to install.
If you want to, for instance, start with Jupyter Notebooks, you need to
install that on your PC somehow. Or, you need to start installing tools
like PyCharm or Anaconda.
All that gets added to your system, and so you mess up your system more
and more and more. What Docker brings you, especially if you're on a Mac
or a Linux system, is simplicity.
#### Preconfigured Images
Because it is so easy to install on those systems, another cool thing
about Docker images is you can just search them in the Docker store,
download them, and install them on your system.
Running them in a completely pre-configured environment, you don't need
to think about stuff. You go to the Docker library, and you search for Deep
Learning, GPU and Python.
You get a list of images you can download. You download one, start it
up, go to the browser and hit up the URL, and just start coding.
Start doing the work. The only other thing you need to do is bind some
drives to that instance so you can exchange files. And, then that's it!
There is no way that you can crash or mess up your system. It's all
encapsulated into Docker. Why this works is because Docker has native
access to your hardware.
#### Take It With You
It's not a completely virtualized environment like a VirtualBox. An
image has the upside that you can take it wherever you want. So, if
you're on your PC at home, use that there.
Make a quick build, take the image, and go somewhere else. Install the
image, which is usually quite fast, and just use it like you're at home.
It's that awesome!
### Kubernetes Container Deployment
I am getting into Docker a lot more myself, for a few different reasons.
What I'm looking for is using Docker with Kubernetes. With Kubernetes,
you can automate the whole container deployment process.
The idea is that you have a cluster of machines. Let's say you have
a 10-server cluster and you run Kubernetes on it.
Kubernetes lets you spin up Docker containers on demand to execute
tasks. You can set up how much resources like CPU, RAM, and network your
Docker container can use.
You can basically spin up containers, on the cluster on demand, whenever
you need to do an analytics task.
That's perfect for data science.
### How to Create, Start, Stop a Container
### Docker Micro-Services?
### Kubernetes
### Why and How to Do Docker Container Orchestration
Podcast about how data science learners use Docker (for data
scientists):
### Useful Docker Commands
Create a container:
docker run --network NETWORK IMAGE
Start a stopped container:
docker start CONTAINER
Stop a running container:
docker stop CONTAINER
List all running containers:
docker ps
List all containers including stopped ones:
docker ps -a
Inspect the container configuration (e.g. network settings, etc.):
docker inspect CONTAINER
List all available virtual networks:
docker network ls
Create a new network:
docker network create NETWORK --driver bridge
Connect a running container to a network:
docker network connect NETWORK CONTAINER
Disconnect a running container from a network:
docker network disconnect NETWORK CONTAINER
Remove a network:
docker network rm NETWORK
The Cloud
---------
### IaaS vs. PaaS vs. SaaS
Check out this podcast. It will help you understand the
difference and how to decide what to use.
| Podcast episode: #082 Reading Tweets With Apache Nifi & IaaS vs PaaS vs SaaS
|------------------|
|In this episode, we talk about the differences between infrastructure as a service, platform as a service, and software as a service. Then, we install the Nifi Docker container and look into how we can extract the Twitter data.
| [Watch on YouTube](https://youtu.be/pWuT4UAocUY) \ [Listen on Anchor](https://anchor.fm/andreaskayy/episodes/082-Reading-Tweets-With-Apache-Nifi--IaaS-vs-PaaS-vs-SaaS-e45j50)|
### AWS, Azure, IBM, Google
Each of these has its own answer to IaaS, PaaS, and SaaS. Pricing and
pricing models vary greatly between each provider. Likewise, each
provider's service may have limitations and strengths.
#### AWS
Here is the [full list of AWS services](https://www.amazonaws.cn/en/products/). Studying for the [AWS Certified Cloud Practitioner](https://aws.amazon.com/certification/certified-cloud-practitioner/?ch=cta&cta=header&p=2) and/or [AWS Certified Solutions Architect](https://aws.amazon.com/certification/certified-solutions-architect-associate/?ch=sec&sec=rmg&d=1) exams can be helpful to quickly gain an understanding of all these services.
Here are links for free digital training for the [AWS Certified Cloud Practitioner](https://explore.skillbuilder.aws/learn/public/learning_plan/view/82/cloud-foundations-learning-plan) and [AWS Certified Solutions Architect Associate](https://explore.skillbuilder.aws/learn/public/learning_plan/view/78/architect-learning-plan).
Here is a free 17 hour [Data Analytics Learning plan](https://explore.skillbuilder.aws/learn/public/learning_plan/view/97/data-analytics-learning-plan) for AWS's [Analytics](https://aws.amazon.com/big-data/datalakes-and-analytics/?nc2=h_ql_prod_an)/Data Engineering services.
#### Azure
[Full list of Azure services](https://azure.microsoft.com/en-us/services/).
[Get started with mini courses](https://docs.microsoft.com/en-us/learn/browse/).
#### IBM
#### Google
Google Cloud Platform offers a wide, ever-evolving variety of services.
[List of GCP services with brief description](https://github.com/gregsramblings/google-cloud-4-words). In
recent years, documentation and tutorials have come a long way to help
[getting started with
GCP](https://cloud.google.com/gcp/getting-started/). You can start with
a free account, but to use many of the services, you will need to turn on
billing. Once you do enable billing, always remember to turn off services
that you have spun up for learning purposes. It is also a good idea to
turn on billing limits and alerts.
### Cloud vs. On-Premises
| Podcast episode: #076 Cloud vs. On-Premise
|------------------|
|How to choose between cloud and on-premises, pros and cons and what you have to think about. There are good reasons to not go cloud. Also, thoughts on how to choose between the cloud providers by just comparing instance prices. Otherwise, the comparison will drive you insane. My suggestion: Basically use them as IaaS and something like Cloudera as PaaS. Then build your solution on top of that.
| [Watch on YouTube](https://youtu.be/BAzj0yGcrnE) \ [Listen on Anchor](https://anchor.fm/andreaskayy/episodes/076-Cloud-vs-On-Premise-How-To-Decide-e45ivk)|
### Security
Listen to a few thoughts about the cloud in this podcast:
### Hybrid Clouds
Hybrid clouds are a mixture of on-premises and cloud deployment. A very
interesting example for this is Google Anthos:
# Data Scientists and Machine Learning
Data scientists aren't like every other scientist.
Data scientists do not wear white coats or work in high tech labs full
of science fiction movie equipment. They work in offices just like you
and me.
What sets them apart from most of us is that they are math experts. They
use linear algebra and multivariable calculus to create new insight from
existing data.
How exactly does this insight look?
Here's an example:
An industrial company produces a lot of products that need to be tested
before shipping.
Usually such tests take a lot of time because there are hundreds of
things to be tested. All to make sure that your product is not broken.
Wouldn't it be great to know early if a test fails ten steps down the
line? If you knew that you could skip the other tests and just trash the
product or repair it.
That's exactly where a data scientist can help you, big-time. This field
is called predictive analytics and the technique of choice is machine
learning.
Machine what? Learning?
Yes, machine learning, it works like this:
You feed an algorithm with measurement data. It generates a model and
optimises it based on the data you fed it with. That model basically
represents a pattern of how your data is looking. You show that model
new data and the model will tell you if the data still represents the
data you have trained it with. This technique can also be used for
predicting machine failure in advance with machine learning. Of course
the whole process is not that simple.
The actual process of training and applying a model is not that hard. A
lot of work for the data scientist is to figure out how to pre-process
the data that gets fed to the algorithms.
In order to train an algorithm you need useful data. If you use just any data
for the training, the produced model will be very unreliable.
An unreliable model for predicting machine failure would tell you that
your machine is damaged even if it is not. Or even worse: It would tell
you the machine is ok even when there is a malfunction.
Model outputs are very abstract. You also need to post-process the model
outputs to receive the outputs you desire.

## Machine Learning Workflow

Data Scientists and Data Engineers. How does that all fit together?
You have to look at the data science process. How stuff is created and how data
science is done. How machine learning is
done.
The machine learning process shows that you start with a training phase. A phase where you are basically training the algorithms to create the right output.
In the learning phase you are having the input parameters. Basically the configuration of the model and you have the input data.
What you're doing is you are training the algorithm. While training, the algorithm modifies the training
parameters. It also modifies the used data and then you are getting to an output.
Once you get an output you are evaluating. Is that output okay, or is that output not the desired output?
If the output is not what you were looking for, then you continue with the training phase.
You're trying to retrain the model hundreds, thousands, hundreds of thousands of times. Of course all this is being done automatically.
Once you are satisfied with the output, you are putting the model into production. In production it is no longer fed with training
data; it's fed with the live data.
It's evaluating the input data live and putting out live results.
So, you went from training to production and then what?
What you do is monitor the output. If the output keeps making sense, all good!
If the output of the model changes and it's no longer what you expected, it means the model doesn't work anymore.
You need to trigger a retraining of the model. It basically gets trained again.
Once you are again satisfied with the output, you put it into production again. It replaces the one in production.
This is the overall process of how machine learning works. It's how the learning part of data science is working.
## Machine Learning Model and Data

Now that's all very nice.
When you look at it, you have two very important places where you have data.
You have in the training phase two types of data:
Data that you use for the training. Data that basically configures the model, the hyper parameter configuration.
Once you're in production you have the live data that is streaming in. Data that is coming in from an app, from
an IoT device, logs, or whatever.
A data catalog is also important. It explains which features are available and how different data sets are labeled.
All different types of data. Now, here comes the engineering part.
The data engineer's part is making this data available. Available to the data scientist and the machine learning process.
So when you look at the model, on the left side you have your hyper parameter configuration. You need to store and manage these configurations somehow.
Then you have the actual training data.
There's a lot going on with the training data:
Where does it come from? Who owns it? Which is basically data governance.
What's the lineage? Have you modified this data? What did you do, what was the basis, the raw data?
You need to access all this data somehow. In training and in production.
In production you need to have access to the live data.
All this is the data engineer's job. Making the data available.
First an architect needs to build the platform. This can also be a good data engineer.
Then the data engineer needs to build the pipelines. How is the data coming in and how is the platform
connecting to other systems.
How is that data then put into the storage? Is preprocessing for the algorithms necessary? He'll do it.
Once the data and the systems are available, it's time for the machine learning part.
It is ready for processing. Basically ready for the data scientist.
Once the analytics is done the data engineer needs to build pipelines to make it then accessible again. For instance for other analytics processes, for APIs, for front ends and so on.
All in all, the data engineer's part is a computer science part.
That's why I love it so much :)
================================================
FILE: sections/03-AdvancedSkills.md
================================================
Advanced Data Engineering Skills
================================
## Contents
- [Data Science Platform](03-AdvancedSkills.md#data-science-platform)
- [Why a Good Data Platform Is Important](03-AdvancedSkills.md#why-a-good-data-platform-is-important)
- [Big Data vs Data Science and Analytics](03-AdvancedSkills.md#big-data-vs-data-science-and-analytics)
- [The 4 Vs of Big Data](03-AdvancedSkills.md#the-4-vs-of-big-data)
- [Why Big Data](03-AdvancedSkills.md#why-big-data)
- [Planning is Everything](03-AdvancedSkills.md#planning-is-everything)
- [The Problem with ETL](03-AdvancedSkills.md#the-problem-with-etl)
- [Scaling Up](03-AdvancedSkills.md#scaling-up)
- [Scaling Out](03-AdvancedSkills.md#scaling-out)
- [When not to Do Big Data](03-AdvancedSkills.md#please-dont-go-big-data)
- [81 Platform & Pipeline Design Questions](03-AdvancedSkills.md#81-platform-and-pipeline-design-questions)
- [Data Source Questions](03-AdvancedSkills.md#data-source-questions)
- [Goals and Destination Questions](03-AdvancedSkills.md#goals-and-destination-questions)
- [Connect](03-AdvancedSkills.md#connect)
- [REST APIs](03-AdvancedSkills.md#rest-apis)
- [API Design](03-AdvancedSkills.md#api-design)
- [Implementation Frameworks](03-AdvancedSkills.md#implementation-frameworks)
- [Security](03-AdvancedSkills.md#security)
- [Apache Nifi](03-AdvancedSkills.md#apache-nifi)
- [Logstash](03-AdvancedSkills.md#logstash)
- [Buffer](03-AdvancedSkills.md#buffer)
- [Apache Kafka](03-AdvancedSkills.md#apache-kafka)
- [Why a Message Queue Tool?](03-AdvancedSkills.md#why-a-message-queue-tool)
- [Kafka Architecture](03-AdvancedSkills.md#kafka-architecture)
- [Kafka Topics](03-AdvancedSkills.md#what-are-topics)
- [Kafka and Zookeeper](03-AdvancedSkills.md#what-does-zookeeper-have-to-do-with-kafka)
- [How to Produce and Consume Messages](03-AdvancedSkills.md#how-to-produce-and-consume-messages)
- [Kafka Commands](03-AdvancedSkills.md#kafka-commands)
- [Redis Pub-Sub](03-AdvancedSkills.md#redis-pub-sub)
- [AWS Kinesis](03-AdvancedSkills.md#aws-kinesis)
- [Google Cloud PubSub](03-AdvancedSkills.md#google-cloud-pubsub)
- [Processing Frameworks](03-AdvancedSkills.md#processing-frameworks)
- [Lambda and Kappa Architecture](03-AdvancedSkills.md#lambda-and-kappa-architecture)
- [Batch Processing](03-AdvancedSkills.md#batch-processing)
- [Stream Processing](03-AdvancedSkills.md#stream-processing)
- [Three Methods of Streaming](03-AdvancedSkills.md#three-methods-of-streaming)
- [At Least Once](03-AdvancedSkills.md#at-least-once)
- [At Most Once](03-AdvancedSkills.md#at-most-once)
- [Exactly Once](03-AdvancedSkills.md#exactly-once)
- [Check The Tools](03-AdvancedSkills.md#check-the-tools)
- [Should You do Stream or Batch Processing](03-AdvancedSkills.md#should-you-do-stream-or-batch-processing)
- [Is ETL still relevant for Analytics?](03-AdvancedSkills.md#is-etl-still-relevant-for-analytics)
- [MapReduce](03-AdvancedSkills.md#mapreduce)
- [How Does MapReduce Work](03-AdvancedSkills.md#How-does-mapreduce-work)
- [MapReduce](03-AdvancedSkills.md#mapreduce)
- [MapReduce Example](03-AdvancedSkills.md#example)
- [MapReduce Limitations](03-AdvancedSkills.md#What-is-the-limitation-of-mapreduce)
- [Apache Spark](03-AdvancedSkills.md#apache-spark)
- [What is the Difference to MapReduce?](03-AdvancedSkills.md#what-is-the-difference-to-MapReduce)
- [How Spark Fits to Hadoop](03-AdvancedSkills.md#how-does-spark-fit-to-hadoop)
- [Spark vs Hadoop](03-AdvancedSkills.md#wheres-the-difference)
- [Spark and Hadoop a Perfect Fit](03-AdvancedSkills.md#spark-and-hadoop-is-a-perfect-fit)
- [Spark on YARN](03-AdvancedSkills.md#spark-on-yarn)
- [My Simple Rule of Thumb](03-AdvancedSkills.md#my-simple-rule-of-thumb)
- [Available Languages](03-AdvancedSkills.md#available-languages)
- [Spark Driver Executor and SparkContext](03-AdvancedSkills.md#how-spark-works-driver-executor-sparkcontext)
- [Spark Batch vs Stream processing](03-AdvancedSkills.md#spark-batch-vs-stream-processing)
- [How Spark uses Data From Hadoop](03-AdvancedSkills.md#How-does-spark-use-data-from-hadoop)
- [What are RDDs and How to Use Them](03-AdvancedSkills.md#what-are-rdds-and-how-to-use-them)
- [SparkSQL How and Why to Use It](03-AdvancedSkills.md#available-languages)
- [What are Dataframes and How to Use Them](03-AdvancedSkills.md#what-are-dataframes-how-to-use-them)
- [Machine Learning on Spark (TensorFlow)](03-AdvancedSkills.md#machine-learning-on-spark-tensor-flow)
- [MLlib](03-AdvancedSkills.md#mllib)
- [Spark Setup](03-AdvancedSkills.md#spark-setup)
- [Spark Resource Management](03-AdvancedSkills.md#spark-resource-management)
- [AWS Lambda](03-AdvancedSkills.md#apache-flink)
- [Apache Flink](03-AdvancedSkills.md#apache-flink)
- [Elasticsearch](03-AdvancedSkills.md#elasticsearch)
- [Apache Drill](03-AdvancedSkills.md#apache-drill)
- [StreamSets](03-AdvancedSkills.md#streamsets)
- [Store](03-AdvancedSkills.md#store)
- [Analytical Data Stores](03-AdvancedSkills.md#analytical-data-stores)
- [Data Warehouse vs Data Lake](03-AdvancedSkills.md#data-warehouse-vs-data-lake)
- [Snowflake and dbt](03-AdvancedSkills.md#snowflake-and-dbt)
- [Transactional Data Stores](03-AdvancedSkills.md#transactional-data-stores)
- [SQL Databases](03-AdvancedSkills.md#sql-databases)
- [PostgreSQL DB](03-AdvancedSkills.md#postgresql-db)
- [Database Design](03-AdvancedSkills.md#database-design)
- [SQL Queries](03-AdvancedSkills.md#sql-queries)
- [Stored Procedures](03-AdvancedSkills.md#stored-procedures)
- [ODBC/JDBC Server Connections](03-AdvancedSkills.md#odbc-jdbc-server-connections)
- [NoSQL Stores](03-AdvancedSkills.md#nosql-stores)
- [HBase KeyValue Store](03-AdvancedSkills.md#keyvalue-stores-hbase)
- [HDFS Document Store](03-AdvancedSkills.md#document-stores-hdfs)
- [MongoDB Document Store](03-AdvancedSkills.md#document-stores-mongodb)
- [Elasticsearch Document Store](03-AdvancedSkills.md#Elasticsearch-search-engine-and-document-store)
- [Graph Databases (Neo4j)](03-AdvancedSkills.md#graph-db-neo4j)
- [Impala](03-AdvancedSkills.md#impala)
- [Kudu](03-AdvancedSkills.md#kudu)
- [Apache Druid](03-AdvancedSkills.md#apache-druid)
- [InfluxDB Time Series Database](03-AdvancedSkills.md#influxdb-time-series-database)
- [Greenplum MPP Database](03-AdvancedSkills.md#mpp-databases-greenplum)
- [NoSQL Data Warehouses](03-AdvancedSkills.md#nosql-data-warehouses)
- [Hive Warehouse](03-AdvancedSkills.md#hive-warehouse)
- [Impala](03-AdvancedSkills.md#impala)
- [Visualize](03-AdvancedSkills.md#visualize)
- [Android and IOS](03-AdvancedSkills.md#android-and-ios)
- [API Design for Mobile Apps](03-AdvancedSkills.md#how-to-design-apis-for-mobile-apps)
- [Dashboards](03-AdvancedSkills.md#dashboards)
- [Grafana](03-AdvancedSkills.md#grafana)
- [Kibana](03-AdvancedSkills.md#kibana)
- [Webservers](03-AdvancedSkills.md#how-to-use-webservers-to-display-content)
- [Tomcat](03-AdvancedSkills.md#tomcat)
- [Jetty](03-AdvancedSkills.md#jetty)
- [NodeRED](03-AdvancedSkills.md#nodered)
- [React](03-AdvancedSkills.md#react)
- [Business Intelligence Tools](03-AdvancedSkills.md#business-intelligence-tools)
- [Tableau](03-AdvancedSkills.md#tableau)
- [Power BI](03-AdvancedSkills.md#power-bi)
- [Qlik Sense](03-AdvancedSkills.md#quliksense)
- [Identity & Device Management](03-AdvancedSkills.md#Identity-and-device-management)
- [What Is A Digital Twin](03-AdvancedSkills.md#what-is-a-digital-twin)
- [Active Directory](03-AdvancedSkills.md#active-directory)
- [Machine Learning](03-AdvancedSkills.md#machine-learning)
- [How to do Machine Learning in production](03-AdvancedSkills.md#how-to-domachine-learning-in-production)
- [Why machine learning in production is harder than you think](03-AdvancedSkills.md#why-machine-learning-in-production-is-harder-then-you-think)
- [Models Do Not Work Forever](03-AdvancedSkills.md#models-do-not-work-forever)
- [Where are The Platforms That Support Machine Learning](03-AdvancedSkills.md#where-are-the-platforms-that-support-this)
- [Training Parameter Management](03-AdvancedSkills.md#training-parameter-management)
- [How to Convince People That Machine Learning Works](03-AdvancedSkills.md#how-to-convince-people-machine-learning-works)
- [No Rules No Physical Models](03-AdvancedSkills.md#no-rules-no-physical-models)
- [You Have The Data. Use It!](03-AdvancedSkills.md#you-have-the-data-use-it)
- [Data is Stronger Than Opinions](03-AdvancedSkills.md#data-is-stronger-than-opinions)
- [AWS Sagemaker](03-AdvancedSkills.md#aws-sagemaker)
## Data Science Platform
### Why a Good Data Platform Is Important
| Podcast Episode: #066 How To Do Data Science From A Data Engineers Perspective
|------------------|
|A simple introduction how to do data science in the context of the internet of things.
| [Watch on YouTube](https://youtu.be/yp_cc4R0mGQ) \ [Listen on Anchor](https://anchor.fm/andreaskayy/episodes/066-How-To-Do-Data-Science-From-A-Data-Engineers-Perspective-e45imt)|
### Big Data vs Data Science and Analytics
I talked about the difference in this podcast:
### The 4 Vs of Big Data
Thinking that big data is only about volume is a complete misconception.
Volume is only one part of the so-called four V's of big data: Volume,
velocity, variety and veracity.
**Volume** is about the size - How much data you have
**Velocity** is about the speed - How fast data is getting to you
How much data in a specific time needs to get processed or is coming
into the system. This is where the whole concept of streaming data and
real-time processing comes in to play.
**Variety** is about the variety - How different your data is
Like CSV files, PDFs that you have and stuff in XML. That you also have
JSON logfiles, or data in some kind of a key-value store.
It's about the variety of data types from different sources that you
basically want to join together. All to make an analysis based on that
data.
**Veracity** is about the credibility - How reliable your data is
The issue with big data is, that it is very unreliable.
You cannot really trust the data. Especially when you're coming from the
Internet of Things (IoT) side. Devices use sensors for measurement of
temperature, pressure, acceleration and so on.
You cannot always be hundred percent sure that the actual measurement is
right.
When you have data that is from for instance SAP and it contains data
that is created by hand you also have problems. As you know we humans
are bad at inputting stuff.
Everybody articulates differently. We make mistakes, down to the spelling
and that can be a very difficult issue for analytics.
I talked about the 4Vs in this podcast:
### Why Big Data?
What I always emphasize is that the four V's are quite nice. They give you a
general direction.
There is a much more important issue: Catastrophic Success.
What I mean by catastrophic success is, that your project, your startup
or your platform has more growth than you anticipated. Exponential
growth is what everybody is looking for.
Because with exponential growth there is the money. It starts small and
gets very big very fast. The classic hockey stick curve:
1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384,
.... BOOM!
Think about it. It starts small and quite slow, but gets very big very
fast.
You get a lot of users or customers who are paying money to use your
service, the platform or whatever. If you have a system that is not
equipped to scale and process the data the whole system breaks down.
That's catastrophic success. You are so successful and grow so fast that
you cannot fulfill the demand anymore. And so you fail and it's all
over.
It's not like you can just make that up as you go, or that you can
foresee a few months or weeks ahead that the current system won't work
anymore.
### Planning is Everything
It all happens very, very fast and you cannot react anymore. That is why
a certain amount of planning and analyzing the potential of your business
case is necessary.
Then you need to decide if you actually have big data or not.
You need to decide if you use big data tools. This means when you
conceptualize the whole infrastructure it might look ridiculous to
actually focus on big data tools.
But in the long run it will help you a lot. Good planning will get a lot
of problems out of the way, especially if you think about streaming data
and real-time analytics.
### The problem with ETL
A typical old-school platform deployment would look like the picture
below. Devices use a data API to upload data that gets stored in a SQL
database. An external analytics tool is querying data and uploading the
results back to the SQL DB. Users then use the user interface to display
data stored in the database.

Now, when the front end queries data from the SQL database the following
three steps happen:
- The database extracts all the needed rows from the storage. (E)
- The extracted data gets transformed, for instance sorted by timestamp or
  something a lot more complex. (T)
- The transformed data is loaded to the destination (the user interface)
  for chart creation. (L)
With exploding amounts of stored data the ETL process starts being a
real problem.
Analytics is working with large data sets, for instance whole days,
weeks, months or more. Data sets are very big like 100GB or Terabytes.
That means Billions or Trillions of rows.
This has the result that the ETL process for large data sets takes
longer and longer. Very quickly the ETL performance gets so bad it won't
deliver results to analytics anymore.
A traditional solution to overcome these performance issues is trying to
increase the performance of the database server. That's what's called
scaling up.
### Scaling Up
To scale up the system and therefore increase ETL speeds administrators
resort to more powerful hardware by:
- Speeding up the extract performance by adding faster disks to physically
  read the data faster.
- Increasing RAM for row caching. What is already in memory does not have
  to be read by slow disk drives.
- Using more powerful CPUs for better transform performance (more RAM
  helps here as well).
- Increasing or optimising networking performance for faster data delivery
  to the front end and analytics.
In summary: Scaling up the system is fairly easy.

But with exponential growth it is obvious that sooner or later (sooner
rather than later) you will run into the same problems again. At some
point you simply cannot scale up anymore because you already have a
monster system, or you cannot afford to buy more expensive hardware.
The next step you could take would be scaling out.
### Scaling Out
Scaling out is the opposite of scaling up. Instead of building bigger
systems the goal is to distribute the load between many smaller systems.
The easiest way of scaling out an SQL database is using a storage area
network (SAN) to store the data. You can then use up to eight SQL
servers (explain), attach them to the SAN and let them handle queries.
This way load gets distributed between those eight servers.

One major downside of this setup is that, because the storage is shared
between the SQL servers, it can only be used as a read-only database.
Updates have to be done periodically, for instance once a day. To do
updates all SQL servers have to detach from the database. Then, one is
attaching the DB in read-write mode and refreshing the data. This
procedure can take a while if a lot of data needs to be uploaded.
This Link (missing) to a Microsoft MSDN page has more options of scaling
out an SQL database for you.
I deliberately don't want to get into details about possible scaling out
solutions. The point I am trying to make is that while it is possible to
scale out SQL databases it is very complicated.
There is no perfect solution. Every option has its up- and downsides.
One common major issue is the administrative effort that you need to
take to implement and maintain a scaled out solution.
### Please don't go Big Data
If you don't run into scaling issues please, do not use big data tools!
Big data is an expensive thing. A Hadoop cluster for instance needs at
least five servers to work properly. More is better.
Believe me this stuff costs a lot of money.
Especially when you take maintenance and development on top of
big data tools into account.
If you don't need it, it makes absolutely no sense at all!
On the other side: If you really need big data tools they will save your
ass :)
## 81 Platform and Pipeline Design Questions
Many people ask: "How do you select the platform, tools and design the pipelines?"
The options seem infinite. Technology however should never dictate the decisions.
Here are 81 questions that you should answer when starting a project
### Data Source Questions
#### Data Origin and Structure
- **What is the source?** Understand the "device."
- **What is the format of the incoming data?** (e.g., JSON, CSV, Avro, Parquet)
- **What’s the schema?**
- **Is the data structured, semi-structured, or unstructured?**
- **What is the data type?** Understand the content of the data.
- **Is the schema well-defined, or is it dynamic?**
- **How are changes in the data structure from the source (schema evolution) handled?**
#### Data Volume & Velocity
- **How much data is transmitted per transmission?**
- **How fast is the data coming in?** (e.g., messages per minute)
- **What is the maximum data volume expected per source per day?**
- **What scaling of sources/data is expected?**
- **Are there peaks for incoming data?**
- **How much data is posted per day across all sources?**
- **How does the data volume fluctuate?** (e.g., seasonal peaks, hourly/daily variations)
- **How will the system handle bursts of data?** (e.g., throttling or buffering)
#### Source Reliability & Redundancy
- **Is there data arriving late?**
- **Is there a risk of duplicate data from the source?** How will we handle de-duplication?
- **How reliable are the sources?** What’s the expected failure rate?
- **How do we handle data corruption or loss during transmission?**
- **What happens if a source goes offline?** Is there a fallback or failover source?
- **Do we need to retry failed transmissions or have fault-tolerance mechanisms in place?**
#### Data Extraction & New Sources
- **Do we need to extract the data from the sources?**
- **How many sources are there?**
- **Will new sources be implemented?**
#### Data Source Connectivity & Authentication
- **How is the data arriving?** (API, bucket, etc.)
- **How is the authentication done?**
- **What kind of connection is required for the data source?** (e.g., streaming, batch, API)
- **What protocols are used for data ingestion?** (e.g., REST, WebSocket, FTP)
- **Are there any rate limits or quotas imposed by the data source?**
- **How do we handle credentials?** Is there an API?
- **What is the retry strategy if data fails to be processed or transmitted?**
#### Data Security & Compliance
- **Does the data need to be encrypted at the source before being transmitted?**
- **Are there any compliance frameworks (e.g., GDPR, HIPAA) that the source data must adhere to?**
- **Is there a requirement for data masking or obfuscation at the source?**
#### Metadata & Audit
- **Is there metadata for the client transmission stored somewhere?**
- **What metadata should be captured for each transmission?** (e.g., record counts, latency)
- **How do we track and log data ingestion events for audit purposes?**
- **Is there a need for tracking data lineage?** (i.e., source origin and changes over time)
---
### Goals and Destination Questions
#### Use Case & Data Consumption
- **What kind of use case is this?** (Analytics, BI, ML, Transactional processing, Visualization, User Interfaces, APIs)
- **What are the typical use cases that require this data?** (e.g., predictive analytics, operational dashboards)
- **What are the downstream systems or platforms that will consume this data?**
- **How critical is real-time data versus historical data in this use case?**
#### Data Query & Delivery
- **How is the data visualized?** (raw data, aggregated data)
- **How much raw data is processed at once?**
- **How much data is cold data, and how often is cold data queried?**
- **How fast do the results need to appear?**
- **How much data is going to be queried at once?**
- **How fresh does the data need to be?**
- **How often is the data queried?** (frequency)
- **What are the SLAs for delivering data to downstream systems or applications?**
#### Aggregation & Modeling
- **How is the data aggregated?** (by devices, topic, time)
- **When does the aggregation happen?** (on query, on schedule, while streaming)
- **What kind of data models are needed for this use case?** (e.g., star schema, snowflake schema)
- **Is there a need for pre-aggregations to speed up queries?**
- **Should partitioning or indexing strategies be implemented to optimize query performance?**
#### Performance & Availability
- **What is the processing time requirement?**
- **What is the availability of analytics output?** (input vs output delay)
- **How fresh does the data need to be?**
- **What are the performance expectations for query speed?**
- **What is the acceptable query response time for end-users?**
- **How will the system handle an increase in concurrent queries from multiple users?**
- **What is the expected lag between data ingestion and availability for querying?**
- **Do we need horizontal scaling for query engines or databases?**
#### Data Lifecycle & Retention
- **What’s the data retention time?**
- **How often is data archived or moved to lower-cost storage?**
- **Will old data need to be transformed or reprocessed for new use cases?**
- **What are the data retention policies?** (e.g., hot vs cold storage)
- **How will the use case evolve as the data grows?** Will this affect how data is consumed or visualized?
#### Monitoring & Debugging
- **How will data delivery to the destination be monitored?** (e.g., time-to-load, query failures)
- **How will we monitor data pipeline health at the destination?** (e.g., throughput, latency)
- **What tools or methods will be used for debugging data delivery failures or performance bottlenecks?**
- **What metrics should be tracked to ensure data pipeline health?** (e.g., latency, throughput)
- **How do we handle issues such as data corruption or incomplete data at the destination?**
#### Data Access & Permissions
- **Who is working with the platform, and who has access to query or visualize the data?**
- **Which tools are used to query the data?**
- **What kind of data export capabilities are required?** (e.g., CSV, API, direct database access)
- **Is role-based access control (RBAC) needed to segment data views for different users?**
- **How will access to sensitive data be managed?** (e.g., row-level security, encryption)
#### Scaling & Future Requirements
- **What are the scalability requirements for the data platform as data volume grows?**
- **How will future business goals or scalability needs affect the design of data aggregation and retention strategies?**
- **How will the system handle an increasing load as more users query data or as data volume grows?**
## Connect
### REST APIs
APIs or Application Programming Interfaces are the cornerstones of any
great data platform.
| Podcast Episode: #033 How APIs Rule The World
|------------------|
|Strong APIs make a good platform. In this episode I talk about why you need APIs and why Twitter is a great example. Especially JSON APIs are my personal favorite. Because JSON is also important in the Big Data world, for instance in log analytics. How? Check out this episode!
| [Listen on Anchor](https://anchor.fm/andreaskayy/episodes/How-APIs-Rule-The-World--PoDS-033-e24ttq)|
#### API Design
In this podcast episode we look into the Twitter API. It's a great
example how to build an API
| Podcast Episode: #081 Twitter API Research Data Engineering Course Part 5
|------------------|
|In this episode we look into the Twitter API documentation, which I love by the way. How can we get old tweets for a certain hashtags and how to get current live tweets for these hashtags?
| [Watch on YouTube](https://youtu.be/UnAXKxeIlyg) \ [Listen on Anchor](https://anchor.fm/andreaskayy/episodes/081-How-to-get-tweets-from-the-Twitter-API-e45j32)|
#### Payload compression attacks
How to defend your Server with zip Bombs
https://www.sitepoint.com/how-to-defend-your-website-with-zip-bombs/
#### Implementation Frameworks
Jersey:
Tutorial – REST API design and implementation in Java with Jersey and Spring:
https://www.codepedia.org/ama/tutorial-rest-api-design-and-implementation-in-java-with-jersey-and-spring/
Swagger:
Jersey vs Swagger:
Spring Framework:
When to use Spring or Jersey:
#### OAuth security
### Apache Nifi
Nifi is one of these tools that I identify as high potential tools. It
allows you to create a data pipeline very easily.
Read data from a REST API and post it to Kafka? No problem. Read data from
Kafka and put it into a database? No problem.
It's super versatile and you can do everything on the UI.
I use it in Part 3 of this Document. Check it out.
Check out the Apache Nifi FAQ website. Also look into the documentation
to find all possible data sources and sinks of Nifi:
Here's a great blog about Nifi:
### Logstash
### FluentD
Data Collector
https://www.fluentd.org/
### Apache Flume
https://flume.apache.org/
### Sqoop
https://sqoop.apache.org/
### Azure IoTHub
https://azure.microsoft.com/en-us/services/iot-hub/
## Buffer
### Apache Kafka
#### Why a message queue tool?
#### Kafka architecture
#### What are topics
#### What does Zookeeper have to do with Kafka
#### How to produce and consume messages
My YouTube video how to set up Kafka at home:
My YouTube video how to write to Kafka:
#### KAFKA Commands
Start Zookeeper container for Kafka:
docker run -d --name zookeeper-server \
--network app-tier \
-e ALLOW_ANONYMOUS_LOGIN=yes \
bitnami/zookeeper:latest
Start Kafka container:
docker run -d --name kafka-server \
--network app-tier \
-e KAFKA_CFG_ZOOKEEPER_CONNECT=zookeeper-server:2181 \
-e ALLOW_PLAINTEXT_LISTENER=yes \
bitnami/kafka:latest
### Redis Pub-Sub
### AWS Kinesis
### Google Cloud PubSub
## Processing Frameworks
### Lambda and Kappa Architecture
| Podcast Episode: #077 Lambda Architecture and Kappa Architecture
|------------------|
|In this stream we talk about the lambda architecture with stream and batch processing as well as an alternative, the Kappa Architecture, that consists only of streaming. Also data engineer vs data scientist, and we discuss Andrew Ng’s AI Transformation Playbook.
| [Watch on YouTube](https://youtu.be/iUOQPyHN9-0) \ [Listen on Anchor](https://anchor.fm/andreaskayy/episodes/077-Lambda--Kappa-Architecture-e45j0r)|
### Batch Processing
Ask the big questions. Remember your last yearly tax statement?
You break out the folders. You run around the house searching for the
receipts.
All that fun stuff.
When you finally found everything you fill out the form and send it on
its way.
Doing the tax statement is a prime example of a batch process.
Data comes in and gets stored, analytics loads the data from storage and
creates an output (insight):

Batch processing is something you do either without a schedule or on a
schedule (tax statement). It is used to ask the big questions and gain
the insights by looking at the big picture.
To do so, batch processing jobs use large amounts of data. This data is
provided by storage systems like Hadoop HDFS.
They can store lots of data (petabytes) without a problem.
Results from batch jobs are very useful, but the execution time is high.
Because the amount of used data is high.
It can take minutes or sometimes hours until you get your results.
### Stream Processing
Gain instant insight into your data.
Streaming allows users to make quick decisions and take actions based on
"real-time" insight. Contrary to batch processing, streaming processes
data on the fly, as it comes in.
With streaming you don't have to wait minutes or hours to get results.
You gain instant insight into your data.
In the batch processing pipeline, the analytics was after the data
storage. It had access to all the available data.
Stream processing creates insight before the data storage. It has only
access to fragments of data as it comes in.
As a result the scope of the produced insight is also limited. Because
the big picture is missing.

Only with streaming analytics are you able to create advanced services
for the customer. Netflix for instance incorporated stream processing
into Chukwa V2.0 and the new Keystone pipeline.
One example of advanced services through stream processing is the
Netflix "Trending Now" feature. Check out the Netflix case study.
#### Three methods of streaming
In stream processing sometimes it is ok to drop messages, other times it
is not. Sometimes it is fine to process a message multiple times, other
times that needs to be avoided like hell.
Today's topic is the different methods of streaming: At most once, at
least once and exactly once.
What this means and why it is so important to keep them in mind when
creating a solution. That is what you will find out in this article.
#### At Least Once
At least once, means a message gets processed in the system once or
multiple times. So with at least once it's not possible that a message
gets into the system and is not getting processed.
It's not getting dropped or lost somewhere in the system.
One example where at least once processing can be used is when you think
about a fleet management of cars. You get GPS data from cars and that
data is transmitted with a timestamp and the GPS coordinates.
It's important that you get the GPS data at least once, so you know
where the car is. If you're processing this data multiple times, it
always has the timestamp with it.
Because of that it does not matter that it gets processed multiple
times, because of the timestamp. Or that it would be stored multiple
times, because it would just overwrite the existing one.
#### At Most Once
The second streaming method is at most once. At most once means that
it's okay to drop some information, to drop some messages.
But it's important that a message is only processed once as a
maximum.
An example for this is event processing. Some event is happening and that
event is not important enough, so it can be dropped. It doesn't have any
consequences when it gets dropped.
But when that event happens it's important that it does not get
processed multiple times. Then it would look as if the event happened
five or six times instead of only one.
Think about engine misfires. If it happens once, no big deal. But if the
system tells you it happens a lot you will think you have a problem with
your engine.
#### Exactly Once
Another thing is exactly once, this means it's not okay to drop data,
it's not okay to lose data and it's also not okay to process data
multiple times.
An example for this is banking. When you think about credit card
transactions it's not okay to drop a transaction.
When dropped, your payment is not going through. It's also not okay to
have a transaction processed multiple times, because then you are paying
multiple times.
#### Check The Tools!
All of this sounds very simple and logical. What kind of processing is
done has to be a requirement for your use case.
It needs to be thought about in the design process, because not every
tool is supporting all three methods. Very often you need to code your
application very differently based on the streaming method.
Especially exactly once is very hard to do.
So, the tool of data processing needs to be chosen based on if you need
exactly once, at least once or if you need at most once.
### Should you do stream or batch processing?
It is a good idea to start with batch processing. Batch processing is
the foundation of every good big data platform.
A batch processing architecture is simple, and therefore quick to set
up. Platform simplicity means, it will also be relatively cheap to run.
A batch processing platform will enable you to quickly ask the big
questions. They will give you invaluable insight into your data and
customers.
When the time comes and you also need to do analytics on the fly, then
add a streaming pipeline to your batch processing big data platform.
### Is ETL still relevant for Analytics?
| Podcast Episode: #039 Is ETL Dead For Data Science & Big Data?
|------------------|
|Is ETL dead in Data Science and Big Data? In today’s podcast I share with you my views on your questions regarding ETL (extract, transform, load). Is ETL still practiced or did pre-processing & cleansing replace it. What would replace ETL in Data Engineering.
| [Watch on YouTube](https://youtu.be/leSOWPaNkl4) \ [Listen on Anchor](https://anchor.fm/andreaskayy/episodes/Is-ETL-Dead-For-Data-Science--Big-Data---PoDS-039-e2b604)|
### MapReduce
Since the early days of the Hadoop eco system, the MapReduce framework
is one of the main components of Hadoop alongside HDFS.
Google for instance used MapReduce to analyse stored HTML content of
websites through counting all the HTML tags and all the words and
combinations of them (for instance headlines). The output was then used
to create the page ranking for Google Search.
That was when everybody started to optimise their website for Google
search. Serious search engine optimisation was born. That was the year
2004.
How MapReduce is working is, that it processes data in two phases: The
map phase and the reduce phase.
In the map phase, the framework is reading data from HDFS. Each dataset
is called an input record.
Then there is the reduce phase. In the reduce phase, the actual
computation is done and the results are stored. The storage target can
either be a database or back to HDFS or something else.
After all it's Java -- so you can implement what you like.
The magic of MapReduce is how the map and reduce phase are implemented
and how both phases are working together.
The map and reduce phases are parallelised. What that means is, that you
have multiple map phases (mappers) and reduce phases (reducers) that can
run in parallel on your cluster machines.
Here's an example how such a map and reduce process works with data:

#### How does MapReduce work
First of all, the whole map and reduce process relies heavily on using
key-value pairs. That's what the mappers are for.
In the map phase input data, for instance a file, gets loaded and
transformed into key-value pairs.
When each map phase is done it sends the created key-value pairs to the
reducers where they are getting sorted by key. This means, that an input
record for the reduce phase is a list of values from the mappers that
all have the same key.
Then the reduce phase is doing the computation of that key and its
values and outputting the results.
How many mappers and reducers can you use in parallel? The number of
parallel map and reduce processes depends on how many CPU cores you have
in your cluster. Every mapper and every reducer is using one core.
This means that the more CPU cores you actually have, the more mappers
you can use, the faster the extraction process can be done. The more
reducers you are using the faster the actual computation is being done.
To make this more clear, I have prepared an example:
#### Example
As I said before, MapReduce works in two stages, map and reduce. Often
these stages are explained with a word count task.
Personally, I hate this example because counting stuff is too trivial and
does not really show you what you can do with MapReduce. Therefore, we
are going to use a more real world use-case from the IoT world.
IoT applications create an enormous amount of data that has to be
processed. This data is generated by physical sensors that take
measurements, like room temperature at 8 o'clock.
Every measurement consists of a key (the timestamp when the measurement
has been taken) and a value (the actual value measured by the sensor).
Because you usually have more than one sensor on your machine, or
connected to your system, the key has to be a compound key. Compound
keys contain in addition to the measurement time information about the
source of the signal.
But, let's forget about compound keys for now. Today we have only one
sensor. Each measurement outputs key-value pairs like: Timestamp-Value.
The goal of this exercise is to create average daily values of that
sensor's data.
The image below shows how the map and reduce process works.
First, the map stage loads unsorted data (input records) from the source
(e.g. HDFS) by key and value (key:2016-05-01 01:02:03, value:1).
Then, because the goal is to get daily averages, the hour:minute:second
information is cut from the timestamp.
That is all that happens in the map phase, nothing more.
After all parallel map phases are done, each key-value pair gets sent to
the one reducer who is handling all the values for this particular key.
Every reducer input record then has a list of values and you can
calculate (1+5+9)/3, (2+6+7)/3 and (3+4+8)/3. That's all.

What do you think you need to do to generate minute averages?
Yes, you need to cut the key differently. You then would need to cut it
like this: "2016-05-01 01:02", keeping the hour and minute information
in the key.
What you can also see is, why map reduce is so great for doing parallel
work. In this case, the map stage could be done by nine mappers in
parallel because each map is independent from all the others.
The reduce stage could still be done by three tasks in parallel. One for
orange, one for blue and one for green.
That means, if your dataset would be 10 times as big and you'd have 10
times the machines, the time to do the calculation would be the same.
#### What is the limitation of MapReduce?
MapReduce is awesome for simpler analytics tasks, like counting stuff.
It just has one flaw: It has only two stages, Map and Reduce.

First MapReduce loads the data from HDFS into the mapping function.
There you prepare the input data for the processing in the reducer.
After the reduce is finished the results get written to the data store.
The problem with MapReduce is that there is no simple way to chain
multiple map and reduce processes together. At the end of each reduce
process the data must be stored somewhere.
This fact makes it very hard to do complicated analytics processes. You
would need to chain MapReduce jobs together.
Chaining jobs with storing and loading intermediate results just makes
no sense.
Another issue with MapReduce is that it is not capable of streaming
analytics. Jobs take some time to spin up, do the analytics and shut
down. Basically, minutes of wait time are totally normal.
This is a big negative point in a more and more real time data
processing world.
### Apache Spark
I talked about the three methods of data streaming in this podcast:
#### What is the difference to MapReduce?
Spark is a complete in-memory framework. Data gets loaded from, for
instance HDFS, into the memory of workers.
There is no longer a fixed map and reduce stage. Your code can be as
complex as you want.
Once in memory, the input data and the intermediate results stay in
memory (until the job finishes). They do not get written to a drive like
with MapReduce.
This makes Spark the optimal choice for doing complex analytics. It
allows you for instance to do iterative processes. Modifying a dataset
multiple times in order to create an output is totally easy.
Streaming analytics capability is also what makes Spark so great. Spark
has natively the option to schedule a job to run every X seconds or X
milliseconds.
As a result, Spark can deliver you results from streaming data in "real
time".
#### How does Spark fit to Hadoop?
There are some very misleading articles out there titled \"Spark or
Hadoop\", \"Spark is better than Hadoop\" or even \"Spark is replacing
Hadoop\".
So, it's time to show you the differences between Spark and Hadoop.
After this you will know when and for what you should use Spark and
Hadoop.
You'll also understand why \"Hadoop or Spark\" is the totally wrong
question.
#### Where's the difference?
To make it clear how Hadoop differs from Spark I created this simple
feature table:

Hadoop is used to store data in the Hadoop Distributed File System
(HDFS). It can analyse the stored data with MapReduce and manage
resources with YARN.
However, Hadoop is more than just storage, analytics and resource
management. There's a whole eco system of tools around the Hadoop core.
I've written about its eco system in this article: [missing](missing)
What is Hadoop and why is it so freakishly popular. You should check it
out as well.
Compared to Hadoop, Spark is "just" an analytics framework. It has no
storage capability. Although it has a standalone resource management,
you usually don't use that feature.
#### Spark and Hadoop is a perfect fit
So, if Hadoop and Spark are not the same things, can they work together?
Absolutely! Here's how the first picture will look if you combine Hadoop
with Spark:
missing
As Storage you use HDFS. Analytics is done with Apache Spark and YARN is
taking care of the resource management.
Why does that work so well together?
From a platform architecture perspective, Hadoop and Spark are usually
managed on the same cluster. This means on each server where a HDFS data
node is running, a Spark worker thread runs as well.
In distributed processing, network transfer between machines is a large
bottleneck. Transferring data within a machine reduces this traffic
significantly.
Spark is able to determine on which data node the needed data is stored.
This allows a direct load of the data from the local storage into the
memory of the machine.
This reduces network traffic a lot.
#### Spark on YARN:
You need to make sure that your physical resources are distributed
perfectly between the services. This is especially the case when you run
Spark workers with other Hadoop services on the same machine.
It just would not make sense to have two resource managers managing the
same server's resources. Sooner or later they will get in each other's
way.
That's why the Spark standalone resource manager is seldom used.
So, the question is not Spark or Hadoop. The question has to be: Should
you use Spark or MapReduce alongside Hadoop's HDFS and YARN.
#### My simple rule of thumb:
If you are doing simple batch jobs like counting values or
calculating averages: Go with MapReduce.
If you need more complex analytics like machine learning or fast stream
processing: Go with Apache Spark.
#### Available Languages
Spark jobs can be programmed in a variety of languages. That makes
creating analytic processes very user-friendly for data scientists.
Spark supports Python, Scala and Java. With the help of SparkR you can
even connect your R program to a Spark cluster.
If you are a data scientist who is very familiar with Python just use
Python, it's great. If you know how to code Java I suggest you start
using Scala.
Spark jobs are easier to code in Scala than in Java. In Scala you can
use anonymous functions to do processing.
This results in less overhead, it is a much cleaner, simpler code.
With Java 8 simplified function calls were introduced with lambda
expressions. Still, a lot of people, including me, prefer Scala over
Java.
#### How Spark works: Driver, Executor, Sparkcontext
| Podcast Episode: #100 Apache Spark Week Day 1
|------------------|
|Is ETL dead in Data Science and Big Data? In today’s podcast I share with you my views on your questions regarding ETL (extract, transform, load). Is ETL still practiced or did pre-processing & cleansing replace it. What would replace ETL in Data Engineering.
| [Watch on YouTube](https://youtu.be/qD6Wi2pfCx0)
#### Spark batch vs stream processing
#### How does Spark use data from Hadoop
Another thing is data locality. I always make the point, that processing
data locally where it is stored is the most efficient thing to do.
That's exactly what Spark is doing. You can and should run Spark workers
directly on the data nodes of your Hadoop cluster.
Spark can then natively identify on what data node the needed data is
stored. This enables Spark to use the worker running on the machine
where the data is stored to load the data into the memory.

The downside of this setup is that you need more expensive servers.
Because Spark processing needs stronger servers with more RAM and CPUs
than a "pure" Hadoop setup.
#### What are RDDs and how to use them
RDDs are the core part of Spark. I learned and used RDDs first. It felt
familiar coming from MapReduce. Nowadays you use Dataframes or Datasets.
I still find it valuable to learn how RDDs and therefore Spark works at
a lower level.
| Podcast Episode: #101 Apache Spark Week Day 2
|------------------|
|On day two of the Apache Spark week we take a look at major Apache Spark concepts: RDDs, transformations and actions, caching and broadcast variables.
| [Watch on YouTube](https://youtu.be/9I6mA2W6_HU)
#### How and why to use SparkSQL?
When you use Apache Zeppelin notebooks to learn Spark you automatically
come across SparkSQL. SparkSQL allows you to access Dataframes with SQL
like queries.
Especially when you work with notebooks it is very handy to create
charts from your data. You can learn from mistakes easier than just
deploying Spark applications.
| Podcast Episode: #102 Apache Spark Week Day 3
|------------------|
| We continue the Spark week, hands on. We do a full example from reading a csv, doing maps and flatmaps, to writing to disk. We also use SparkSQL to visualize the data.
| [Watch on YouTube](https://youtu.be/Fk-s8eKD4ZI)
#### What are DataFrames how to use them
As I said before. Dataframes are the successors to RDDs. It's the new
Spark API.
Dataframes are basically like tables in a SQL database or like an Excel
sheet. This makes them very simple to use and manipulate with SparkSQL.
I highly recommend to go this route.
Processing with Dataframes is even faster than with RDDs, because it
uses optimization algorithms for the data processing.
| Podcast Episode: #103 Apache Spark Week Day 4
|------------------|
|We look into Dataframes, Dataframes and Dataframes.
| [Watch on YouTube](https://youtu.be/9I6mA2W6_HU)
#### Machine Learning on Spark? (Tensor Flow)
Wouldn't it be great to use your deep learning TensorFlow applications
on Spark? Yes, it is already possible. Check out these Links:
Why do people integrate Spark with TensorFlow even if there is a
distributed TensorFlow framework?
TensorFlow On Spark: Scalable TensorFlow Learning on Spark Clusters:
Deep Learning with Apache Spark and TensorFlow:
#### MLlib:
The machine learning library MLlib is included in Spark so there is
often no need to import another library.
I have to admit because I am not a data scientist I am not an expert in
machine learning.
From what I have seen and read though the machine learning framework
MLlib is a nice treat for data scientists wanting to train and apply
models with Spark.
#### Spark Setup
From a solution architect's point of view Spark is a perfect fit for
Hadoop big data platforms. This has a lot to do with cluster deployment
and management.
Companies like Cloudera, MapR or Hortonworks include Spark into their
Hadoop distributions. Because of that, Spark can be deployed and managed
with the cluster's Hadoop management web frontend.
This makes the process for deploying and configuring a Spark cluster
very quick and admin friendly.
#### Spark Resource Management
When running a computing framework you need resources to do computation:
CPU time, RAM, I/O and so on. Out of the box Spark can manage resources
with its stand-alone resource manager.
If Spark is running in a Hadoop environment you don't have to use
Spark's own stand-alone resource manager. You can configure Spark to use
Hadoop's YARN resource management.
Why would you do that? It allows YARN to efficiently allocate resources
to your Hadoop and Spark processes.
Having a single resource manager instead of two independent ones makes
it a lot easier to configure the resource management.

### Samza
[Link to Apache Samza Homepage](http://samza.apache.org/)
### AWS Lambda
[Link to AWS Lambda Homepage](https://aws.amazon.com/lambda/)
### Apache Flink
[Link to Apache Flink Homepage](https://flink.apache.org/)
### Elasticsearch
[Link to Elasticsearch Homepage](https://www.elastic.co/products/elastic-stack)
### Graph DB
Graph databases store data in terms of nodes and relationships.
Each node represents an entity (people, movies, things and other
data points) and a relationship represents how the nodes are related.
They are designed to store and treat the relationships with the same
importance as the data itself (or nodes in this case). This
relationship-first approach makes a lot of difference as the relationship
between data need not be inferred anymore with foreign and primary keys.
Graph databases are especially useful when applications require
navigating through multiple and multi-level relationships between
various data points.
#### Neo4j
Neo4j is currently the most popular graph database management system.
It is ACID compliant and provides its own implementation of a graph database.
In addition to nodes and relationships, neo4j has the following components
to enrich the data model with information.
• Labels. These are used to group nodes, and each node can be assigned
multiple labels. Labels are indexed to speed up finding nodes in a graph.
• Properties. These are attributes of both nodes and relationships.
Neo4j allows for storing data as key-value pairs, which means properties
can have any value (string, number, or boolean).
##### Advantages
• Neo4j is schema-free
• Highly available and provides transactional guarantees
• Cypher is a declarative query language which makes it very easy to navigate the graph
• Neo4j is fast and easily traversable because the data is connected and is very easy to query, retrieve and navigate the graph
• For the same reason as above, there are no joins in Neo4j
##### Disadvantages
• Neo4j is not the best for any kind of aggregations or sorting, in comparison with a relational database
• While doable, they are not the best to handle transactional data like accounting
• Sharding is currently not supported
##### Use Cases
https://neo4j.com/use-cases/
### Apache Solr
[Link to Solr Homepage](https://solr.apache.org)
### Apache Drill
[Link to Apache Drill Homepage](https://drill.apache.org)
### Apache Storm
https://storm.apache.org/
### StreamSets
## Store
### Analytical Data Stores
#### Data Warehouse vs Data Lake
| Podcast Episode: #055 Data Warehouse vs Data Lake
|------------------|
|On this podcast we are going to talk about data warehouses and data lakes. When do people use which? What are the pros and cons of both? We look at architecture examples for both. Does it make sense to completely move to a data lake?
| [Watch on YouTube](https://youtu.be/8gNQTrUUwMk) \ [Listen on Anchor](https://anchor.fm/andreaskayy/episodes/055-Data-Warehouse-vs-Data-Lake-e45iem)|
#### Snowflake and dbt

In the rapidly evolving landscape of data engineering, staying ahead means continuously expanding your skill set with the latest tools and technologies. Among the myriad of options available, dbt (data build tool) and Snowflake have emerged as indispensable for modern data engineering workflows. Understanding and leveraging these tools can significantly enhance your ability to manage and transform data, making you a more effective and valuable data engineer. Let's dive into why dbt and Snowflake should be at the top of your learning list and explore how the "dbt for Data Engineers" and "Snowflake for Data Engineers" courses from the Learn Data Engineering Academy can help you achieve mastery in these tools.
##### The Power of Snowflake in Data Engineering
Snowflake has revolutionized the data warehousing space with its cloud-native architecture. It offers a scalable, flexible, and highly performant platform that simplifies data management and analytics. Here’s why Snowflake is a critical skill for data engineers:
1. **Cloud-Native Flexibility:** Snowflake’s architecture allows you to scale resources up or down based on your needs, ensuring optimal performance and cost-efficiency.
2. **Unified Data Platform:** It unifies data silos, enabling seamless data sharing and collaboration across the organization.
3. **Integration Capabilities:** Snowflake integrates with various data tools and platforms, enhancing its versatility in different data workflows.
4. **Advanced Analytics:** With its robust support for data querying, transformation, and integration, Snowflake is ideal for complex analytical workloads.
The "Snowflake for Data Engineers" course in my Learn Data Engineering Academy provides comprehensive training on Snowflake. From the basics of setting up your Snowflake environment to advanced data automation with Snowpipes, the course equips you with practical skills to leverage Snowflake effectively in your data projects.
Learn more about the course [here](https://learndataengineering.com/p/snowflake-for-data-engineers).

##### Why dbt is a Game-Changer for Data Engineers
dbt is a powerful transformation tool that allows data engineers to transform, test, and document data directly within their data warehouse using simple SQL. Unlike traditional ETL tools, dbt operates on the principle of ELT (Extract, Load, Transform), which aligns perfectly with modern cloud data warehousing paradigms. Here are a few reasons why dbt is a must-have skill for data engineers:
1. **SQL-First Approach:** dbt allows you to write transformations in SQL, the lingua franca of data manipulation, making it accessible to a broad range of data professionals.
2. **Collaboration:** Teams can collaborate seamlessly, creating trusted datasets for reporting, machine learning, and operational workflows.
3. **Ease of Use:** With dbt, you can transform, test, and document your data with ease, streamlining the data pipeline process.
4. **Integration:** dbt integrates effortlessly with your existing data warehouse, such as Snowflake, making it a versatile addition to your toolkit.
In my Learn Data Engineering Academy you find the perfect starting point for mastering dbt with the course "dbt for Data Engineers". The course covers everything from the basics of ELT processes to advanced features like continuous integration and deployment (CI/CD) pipelines. With hands-on training, you'll learn to create data pipelines, configure dbt materializations, test dbt models, and much more.
Learn more about the course [here](https://learndataengineering.com/p/dbt-for-data-engineers).

##### dbt and Snowflake: A Winning Combination
When used together, dbt and Snowflake offer a powerful combination for data engineering. Here’s why:
1. **Seamless Integration:** dbt’s SQL-first transformation capabilities integrate perfectly with Snowflake’s scalable data warehousing, creating a streamlined ELT workflow.
2. **Efficiency:** Together, they enhance the efficiency of data transformation and analytics, reducing the time and effort required to prepare data for analysis.
3. **Scalability:** The combined power of dbt’s model management and Snowflake’s dynamic scaling ensures that your data pipelines can handle large and complex datasets with ease.
4. **Collaboration and Documentation:** dbt’s ability to document and test data transformations directly within Snowflake ensures that data workflows are transparent, reliable, and collaborative.
Get right into it with our Academy!
By integrating Snowflake and dbt into your skill set, you position yourself at the forefront of data engineering innovation. These tools not only simplify and enhance your data workflows but also open up new possibilities for data transformation and analysis.
### Transactional Data Stores
#### SQL Databases
##### PostgreSQL DB
Homepage:
PostgreSQL vs MongoDB:
##### Database Design
##### SQL Queries
##### Stored Procedures
##### ODBC/JDBC Server Connections
#### NoSQL Stores
##### KeyValue Stores (HBase)
| Podcast Episode: #056 NoSQL Key Value Stores Explained with HBase
|------------------|
|What is the difference between SQL and NoSQL? In this episode I show you on the example of HBase how a key/value store works.
| [Watch on YouTube](https://youtu.be/67hIkbpzFc8) \ [Listen on Anchor](https://anchor.fm/andreaskayy/episodes/056-NoSQL-Key-Value-Stores-Explained-With-HBase-e45ifb)|
##### Document Store HDFS
The Hadoop distributed file system, or HDFS, allows you to store files
in Hadoop. The difference between HDFS and other file systems like NTFS
or EXT is that it is a distributed one.
What does that mean exactly?
A typical file system stores your data on the actual hard drive. It is
hardware dependent.
If you have two disks then you need to format every disk with its own
file system. They are completely separate.
You then decide on which disk you physically store your data.
HDFS works differently from a typical file system. HDFS is hardware
independent.
Not only does it span over many disks in a server. It also spans over
many servers.
HDFS will automatically place your files somewhere in the Hadoop server
collective.
It will not only store your file, Hadoop will also replicate it two or
three times (you can define that). Replication means replicas of the
file will be distributed to different servers.

This gives you superior fault tolerance. If one server goes down, then
your data stays available on a different server.
Another great thing about HDFS is, that there is no limit how big the
files can be. You can have server log files that are terabytes big.
How can files get so big? HDFS allows you to append data to files.
Therefore, you can continuously dump data into a single file without
worries.
HDFS physically stores files differently than a normal file system. It
splits the file into blocks.
These blocks are then distributed and replicated on the Hadoop cluster.
The splitting happens automatically.

In the configuration you can define how big the blocks should be. 128
megabyte or 1 gigabyte?
No problem at all.
This mechanic of splitting a large file in blocks and distributing them
over the servers is great for processing. See the MapReduce section for
an example.
##### Document Store MongoDB
| Podcast Episode: #093 What is MongoDB
|------------------|
|What is the difference between SQL and NoSQL? In this episode I show you on the example of HBase how a key/value store works.
| [Watch on YouTube](https://youtu.be/U05knQN29FA)
**Links:**
What is MongoDB:
Or directly from MongoDB.com:
Storage in BSON files:
Hello World in MongoDB:
Real-Time Analytics on MongoDB Data in Power BI:
Spark and MongoDB:
MongoDB vs Time Series Database:
Fun article titled why you should never use mongodb:
MongoDB vs Cassandra:
##### Elasticsearch Search Engine and Document Store
Elasticsearch is not a DB but firstly a search engine that indexes JSON
documents.
| Podcast Episode: #095 What is Elasticsearch & Why is It So Popular?
|------------------|
|Elasticsearch is a super popular tool for indexing and searching data. On this stream we check out how it works, architectures and what to use it for. There must be a reason why it is so popular.
| [Watch on YouTube](https://youtu.be/hNb5zB4OPXM)
Links:
Great example for architecture with Elasticsearch, Logstash and Kibana:\
Introduction to Elasticsearch in the documentation:\
Working with JSON documents:\
JSONs need to be flattened, here's how to work with nested objects in the
JSON:\
Indexing basics:\
How to do searches with search API:\
General recommendations when working with Elasticsearch:\
JSON document example and intro to Kibana:\
How to connect Tableau to Elasticsearch:\
Benchmarks how fast Elasticsearch is:\
Elasticsearch vs MongoDB quick overview:\
Logstash overview (preprocesses data before insert into Elasticsearch)
X-Pack Security for Elasticsearch:\
Google Trends Grafana vs Kibana:\
##### Apache Impala
[Apache Impala Homepage](https://impala.apache.org/)
##### Kudu
##### Apache Druid
| Podcast Episode: Druid NoSQL DB and Analytics DB Introduction
|------------------|
|In this video I explain what Druid is and how it works. We look into the architecture of a Druid cluster and check out how Clients access the data.
|[Watch on YouTube](https://youtu.be/EiEIeBXSWjM)
##### InfluxDB Time Series Database
What is time-series data?
Key concepts:
InfluxDB and Spark Streaming
Building a Streaming application with spark, grafana, chronogram and
influx:
Performance Dashboard Spark and InfluxDB:
Other alternatives for time series databases are: DalmatinerDB,
QuestDB, Prometheus, Riak TS, OpenTSDB, KairosDB
##### MPP Databases (Greenplum)
##### Azure Cosmos DB
https://azure.microsoft.com/en-us/services/cosmos-db/
##### Azure Table-Storage
https://azure.microsoft.com/en-us/services/storage/tables/
#### NoSQL Data warehouse
##### Hive Warehouse
##### Impala
## Visualize
### Android & IOS
### How to design APIs for mobile apps
### How to use Webservers to display content
### Dashboards
#### Grafana
#### Kibana
#### Tomcat
#### Jetty
#### NodeRED
#### React
### Business Intelligence Tools
#### Tableau
#### PowerBI
#### Quliksense
### Identity & Device Management
#### What is a digital twin?
#### Active Directory
Machine Learning
----------------
| Podcast Episode: Machine Learning In Production
|------------------|
|Doing machine learning in production is very different than for proof of concepts or in education. One of the hardest parts is keeping models updated.
| [Listen on Anchor](https://anchor.fm/andreaskayy/episodes/Machine-Learning-In-Production-e11bbk)
### How to do Machine Learning in production
Machine learning in production is using stream and batch processing. In
the batch processing layer you are creating the models, because you have
all the data available for training.
In the stream processing layer you are using the created models, you
are applying them to new data.
The idea that you need to incorporate is that it is a constant cycle.
Training, applying, re-training, pushing into production and applying.
What you don't want to do is doing this manually. You need to figure out
a process of automatic retraining and automatic pushing into
production of models.
In the retraining phase the system automatically evaluates the training.
If the model no longer fits it works as long as it needs to create a
good model.
After the evaluation of the model is complete and it's good, the model
gets pushed into production. Into the stream processing.
### Why machine learning in production is harder then you think
How to automate machine learning is something that drives me day in and
day out.
What you do in development or education is, that you create a model and
fit it to the data. Then that model is basically done forever.
Where I'm coming from, the IoT world, the problem is that machines are
very different. They behave very differently and experience wear.
### Models Do Not Work Forever
Machines have certain processes that decrease the actual health of the
machine. Machine wear is a huge issue. Models that are built on top
of a good machine don't work forever.
When the Machine wears out, the models need to be adjusted. They need to
be maintained, retrained.
### Where The Platforms That Support This?
Automatic re-training and re-deploying is a very big issue, a very big
problem for a lot of companies. Because most existing platforms don't
have this capability (I actually haven't seen one until now).
Look at AWS machine learning for instance. The process is: build, train,
tune, deploy. Where's the loop of retraining?
You can create models and then use them in production. But this loop is
almost nowhere to be seen.
It is a very big issue that needs to be solved. If you want to do
machine learning in production you can start with manual interaction of
the training, but at some point you need to automate everything.
### Training Parameter Management
To train a model you are manipulating input parameters of the models.
Take deep learning for instance.
To train you are manipulating for instance:
- How many layers you use.
- The depth of the layers, which means how many neurons you have in a layer.
- What activation function you use, how long you are training and so on.
You also need to keep track of what data you used to train which model.
All those parameters need to be manipulated automatically, models
trained and tested.
To do all that, you basically need a database that keeps track of those
variables.
How to automate this, for me, is like the big secret. I am still working
on figuring it out.
### What's Your Solution?
Did you already have the problem of automatic re-training and deploying
of models as well?
Were you able to use a cloud platform like Google, AWS or Azure?
It would be really awesome if you share your experience :)
### How to convince people machine learning works
Many people still are not convinced that machine learning works
reliably. But they want analytics insight and most of the time machine
learning is the way to go.
This means, when you are working with customers you need to do a lot of
convincing. Especially if they are not into machine learning themselves.
But it's actually quite easy.
### No Rules, No Physical Models
Many people are still under the impression that analytics only works
when it's based on physics. When there are strict mathematical rules to
a problem.
Especially in engineering heavy countries like Germany this is the norm:
"Sere has to be a Rule for Everysing!" (imagine a German accent). When
you're engineering you are calculating stuff based on physics and not
based on data. If you are constructing an airplane wing, you better make
sure to use calculations so it doesn't fall off.
And that's totally fine.
Keep doing that!
Machine learning has been around for decades. It didn't quite work as
well as people hoped. We have to admit that. But there is this
preconception that it still doesn't work.
Which is not true: Machine learning works.
Somehow you need to convince people that it is a viable approach. That
learning from data to make predictions is working perfectly.
### You Have The Data. USE IT!
As a data scientist you have one ace up your sleeve, it's the obvious
one:
It's the data and it's statistics.
You can use that data and those statistics to counter people's
preconceptions. It's very powerful if someone says: "This doesn't work"
You bring the data. You show the statistics and you show that it works
reliably.
A lot of discussions end there.
Data doesn't lie. You can't fight data. The data is always right.
### Data is Stronger Than Opinions
This is also why I believe that autonomous driving will come quicker
than many of us think. Because a lot of people say, they are not safe.
That you cannot rely on those cars.
The thing is: When you have the data you can do the statistics.
You can show people that autonomous driving really works reliably. You
will see, the question of \"Is this allowed or is this not allowed?\"
will be gone quicker than you think.
Because government agencies can start testing the algorithms based on
predefined scenarios. They can run benchmarks and score the cars'
performance.
All those opinions, if it works, or if it doesn't work, they will be
gone.
The motor agency has the statistics. The stats show people how well cars
work.
Companies like Tesla, they have it very easy. Because the data is
already there.
**They just need to show us that the algorithms work. The end.**
### AWS Sagemaker
Train and apply models online with Sagemaker
Link to the OLX Slideshare with pros, cons and how to use Sagemaker:
================================================
FILE: sections/04-HandsOnCourse.md
================================================
Data Engineering Course: Building A Data Platform
=================================================
## Contents
- [GenAI Retrieval Augmented Generation with Ollama and Elasticsearch](04-HandsOnCourse.md#genai-retrieval-augmented-generation-with-ollama-and-elasticsearch)
- [Free Data Engineering Course with AWS, TDengine, Docker and Grafana](04-HandsOnCourse.md#free-data-engineering-course-with-aws-tdengine-docker-and-grafana)
- [Monitor your data in dbt & detect quality issues with Elementary](04-HandsOnCourse.md#monitor-your-data-in-dbt-and-detect-quality-issues-with-elementary)
- [Solving Engineers 4 Biggest Airflow Problems](04-HandsOnCourse.md#solving-engineers-4-biggest-airflow-problems)
- [The best alternative to Airflow? Mage.ai](04-HandsOnCourse.md#the-best-alternative-to-airlfow-mageai)
## GenAI Retrieval Augmented Generation with Ollama and Elasticsearch
- This how-to is based on this one from Elasticsearch: https://www.elastic.co/search-labs/blog/rag-with-llamaIndex-and-elasticsearch
- Instead of Elasticsearch cloud we're going to run everything locally
- The simplest way to get this done is to just clone this GitHub Repo for the code and docker setup
- I've tried this on an M1 Mac. Changes for Windows with WSL will come later.
- The biggest problems that I had were actually installing the dependencies rather than the code itself.
### Install Ollama
1. Download Ollama from here https://ollama.com/download/mac
2. Unzip, drag into applications and install
3. do `ollama run mistral` (It's going to download the Mistral 7b model, 4.1GB size)
4. Create a new folder in Documents "Elasticsearch-RAG"
5. Open that folder in VSCode
### Install Elasticsearch & Kibana (Docker)
1. Use the docker-compose file from this repo: https://github.com/andkret/Cookbook/blob/master/Code%20Examples/GenAI-RAG/docker-compose.yml
2. Download Docker Desktop from here: https://www.docker.com/products/docker-desktop/
3. Install docker desktop and sign in in the app/create a user -> sends you to the browser
**For Windows Users**
Configure WSL2 to use max only 4GB of ram:
```
wsl --shutdown
notepad "$env:USERPROFILE/.wslconfig"
```
.wslconfig file:
```
[wsl2]
memory=4GB # Limits VM memory in WSL 2 up to 4GB
```
**Modify the Linux kernel map count in WSL**
Do this before the start because Elasticsearch requires a higher value to work
`sudo sysctl -w vm.max_map_count=262144`
4. go to the Elasticsearch-RAG folder and do `docker compose up`
5. make sure you have Elasticsearch 8.11 or later (we use 8.16 here in this project) if you want to use your own Elasticsearch image
6. if you get this error on a mac then just open the console in the docker app: *error getting credentials - err: exec: docker-credential-desktop: executable file not found in $PATH, out:*
7. Install xcode command line tools: `xcode-select --install`
8. make sure you're at python 3.8.1 or larger -> installed 3.13.0 from https://www.python.org/downloads/
### Setup the virtual Python environment
#### preparation on a Mac
##### install brew
```
which brew
/bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"
export PATH="/opt/homebrew/bin:$PATH"
brew --version
brew install pyenv
brew install pyenv-virtualenv
```
##### install pyenv
```
brew install pyenv
brew install pyenv-virtualenv
```
Modify the path so that pyenv is in the path variable
`nano ~/.zshrc`
```
export PYENV_ROOT="$HOME/.pyenv"
export PATH="$PYENV_ROOT/bin:$PATH"
eval "$(pyenv init --path)"
eval "$(pyenv init -)"
eval "$(pyenv virtualenv-init -)"
```
install dependencies for building python versions
`brew install openssl readline sqlite3 xz zlib`
Reload to apply changes
`source ~/.zshrc`
install python
```
pyenv install 3.11.6
pyenv version
```
Set Python version system wide
`pyenv global 3.11.6`
```
pyenv virtualenv
pyenv activate
pyenv virtualenv-delete
```
#### Windows without pyenv
setup virtual python environment - go to the Elasticsearch-RAG folder and do
`python3 -m venv .elkrag`
enable the environment
`source .elkrag/bin/activate`
### Install required libraries (do one at a time so you see errors):
```
pip install llama-index  # optional alternative: python3 -m pip install <package name>
pip install llama-index-embeddings-ollama
pip install llama-index-llms-ollama
pip install llama-index-vector-stores-elasticsearch
pip install python-dotenv
```
### Write the data to Elasticsearch
1. create / copy in the index.py file
2. download the conversations.json file from the folder code examples/GenAI-RAG
3. if you get an error with the execution then check if the pydantic version is <2.0 with `pip show pydantic`; if not, do this: `pip install "pydantic<2.0"`
4. run the program index.py: https://github.com/andkret/Cookbook/blob/master/Code%20Examples/GenAI-RAG/index.py
### Check the data in Elasticsearch
1. go to kibana http://localhost:5601/app/management/data/index_management/indices and see the new index called calls
2. go to dev tools at http://localhost:5601/app/dev_tools#/console/shell and try out this query: `GET calls/_search?size=1`
### Query data from elasticsearch and create an output with Mistral
1. if everything is good then run the query.py file https://github.com/andkret/Cookbook/blob/master/Code%20Examples/GenAI-RAG/query.py
2. try a few queries :)
### Install libraries to extract text from pdfs
### Extract data from CV and put it into Elasticsearch
I created a CV with ChatGPT https://github.com/andkret/Cookbook/blob/master/Code%20Examples/GenAI-RAG/Liam_McGivney_CV.pdf
Install the library to extract text from the pdf
`pip install PyMuPDF`
I had to Shift+Command+p then python clear workspace cache and reload window. Then it saw it :/
The file cvpipeline.py has the python code for the indexing. It's not working right now though!
https://github.com/andkret/Cookbook/blob/master/Code%20Examples/GenAI-RAG/cvpipeline.py
I'll keep developing this and update it once it's working.
## Free Data Engineering Course with AWS TDengine Docker and Grafana
**Free hands-on course:** [Watch on YouTube](https://youtu.be/eoj-CnrR9jA)
In this detailed tutorial video, Andreas guides viewers through creating an end-to-end data pipeline using time series data. The project focuses on fetching weather data from a Weather API, processing it on AWS, storing it in TDengine (a time series database), and visualizing the data with Grafana. Here's a concise summary of what the video covers:
1. **Introduction and Setup:**
- The project is introduced along with a GitHub repository containing all necessary resources and a step-by-step guide.
- The pipeline architecture includes an IoT weather station, a Weather API, AWS for processing, TDengine for data storage, and Grafana for visualization.
2. **Project Components:**
- **Weather API:** Utilizes weatherapi.com to fetch weather data.
- **AWS Lambda:** Processes the data fetched from the Weather API.
- **TDengine:** Serves as the time series database to store processed data. It's highlighted for its performance and simplicity, especially for time series data.
- **Grafana:** Used for creating dashboards to visualize the time series data.
3. **Development and Deployment:**
- The local development environment setup includes Python, Docker, and VS Code.
- The tutorial covers the creation of a Docker image for the project and deploying it to AWS Elastic Container Registry (ECR).
- AWS Lambda is then configured to use the Docker image from ECR.
- AWS EventBridge is used to schedule the Lambda function to run at specified intervals.
4. **Time Series Data:**
- The importance of time series data and the benefits of using a time series database like TDengine over traditional relational databases are discussed.
- TDengine's features such as speed, scaling, data retention, and built-in functions for time series data are highlighted.
5. **Building the Pipeline:**
- Detailed instructions are provided for setting up each component of the pipeline:
- Fetching weather data from the Weather API.
- Processing and sending the data to TDengine using an AWS Lambda function.
- Visualizing the data with Grafana.
- Each step includes code snippets and configurations needed to implement the pipeline.
6. **Conclusion:**
- The video concludes with a demonstration of the completed pipeline, showing weather data visualizations in Grafana.
- Viewers are encouraged to replicate the project using the resources provided in the GitHub repository linked in the video description.
This video provides a comprehensive guide to building a data pipeline with a focus on time series data, demonstrating the integration of various technologies and platforms to achieve an end-to-end solution.
## Monitor your data in dbt and detect quality issues with Elementary
**Free hands-on tutorial:** [Watch on YouTube](https://youtu.be/6fnU91Q2gq0)
In this comprehensive tutorial, Andreas delves into the integration of dbt (data build tool) with Elementary to enhance data monitoring and quality detection within Snowflake databases. The tutorial is structured to guide viewers through a hands-on experience, starting with an introduction to a sample project setup and the common challenges faced in monitoring dbt jobs. It then transitions into how Elementary can be utilized to address these challenges effectively.
Key learning points and tutorial structure include:
1. **Introduction to the Sample Project:** Andreas showcases a project setup involving Snowflake as the data warehouse, dbt for data modeling and testing, and a visualization tool for data analysis. This setup serves as the basis for the tutorial.
2. **Challenges in Monitoring dbt Jobs:** Common issues in monitoring dbt jobs are discussed, highlighting the limitations of the dbt interface in providing comprehensive monitoring capabilities.
3. **Introduction to Elementary:** Elementary is introduced as a dbt-native data observability tool designed to enhance the monitoring and analysis of dbt jobs. It offers both open-source and cloud versions, with the tutorial focusing on the cloud version.
4. **Setup Requirements:** The tutorial covers the necessary setup on both the Snowflake and dbt sides, including schema creation, user and role configuration in Snowflake, and modifications to the dbt project for integrating with Elementary.
5. **Elementary's User Interface and Features:** A thorough walkthrough of Elementary's interface is provided, showcasing its dashboard, test results, model runs, data catalog, and data lineage features. The tool's ability to automatically run additional tests, like anomaly detection and schema change detection, is also highlighted.
6. **Advantages of Using Elementary:** The presenter outlines several benefits of using Elementary, such as easy implementation, native test integration, clean and straightforward UI, and enhanced privacy due to data being stored within the user's data warehouse.
7. **Potential Drawbacks:** Some potential drawbacks are discussed, including the additional load on dbt job execution due to more models being run and limitations in dashboard customization.
8. **Summary and Verdict:** The tutorial concludes with a summary of the key features and benefits of using Elementary with dbt, emphasizing its value in improving data quality monitoring and detection.
Overall, viewers are guided through setting up and utilizing Elementary for dbt data monitoring, gaining insights into its capabilities, setup process, and the practical benefits it offers for data quality assurance.
## Solving Engineers 4 Biggest Airflow Problems
**Free hands-on tutorial:** [Watch on YouTube](https://youtu.be/b9bMNEh8bes)
In this informative video, Andreas discusses the four major challenges engineers face when working with Apache Airflow and introduces Astronomer, a managed Airflow service that addresses these issues effectively. Astronomer is highlighted as a solution that simplifies Airflow deployment and management, making it easier for engineers to develop, deploy, and monitor their data pipelines. Here's a summary of the key points discussed for each challenge and how Astronomer provides solutions:
1. Managing Airflow Deployments:
- **Challenge:** Setting up and maintaining Airflow deployments is complex and time-consuming, involving configuring cloud instances, managing resources, scaling, and updating the Airflow system.
- **Solution with Astronomer:** Offers a straightforward deployment process where users can easily configure their deployments, choose cloud providers (GCP, AWS, Azure), and set up scaling with just a few clicks. Astronomer handles the complexity, making it easier to manage production and quality environments.
2. Development Environment and Deployment:
- **Challenge:** Local installation of Airflow is complicated due to its dependency on multiple Docker containers and the need for extensive configuration.
- **Solution with Astronomer:** Provides a CLI tool for setting up a local development environment with a single command, simplifying the process of developing, testing, and deploying pipelines. The Astronomer CLI also helps in initializing project templates and deploying Dags to the cloud effortlessly.
3. Source Code Management and CI/CD Pipelines:
- **Challenge:** Collaborative development and continuous integration/deployment (CI/CD) are essential but challenging to implement effectively with Airflow alone.
- **Solution with Astronomer:** Facilitates easy integration with GitHub for source code management and GitHub Actions for CI/CD. This allows automatic testing and deployment of pipeline code, ensuring a smooth workflow for teams working on pipeline development.
4. Observing Pipelines and Alarms:
- **Challenge:** Monitoring data pipelines and getting timely alerts when issues occur is crucial but often difficult to achieve.
- **Solution with Astronomer:** The Astronomer platform provides a user-friendly interface for monitoring pipeline status and performance. It also offers customizable alerts for failures or prolonged task durations, with notifications via email, PagerDuty, or Slack, ensuring immediate awareness and response to issues.
Overall, the video shows Astronomer as a powerful and user-friendly platform that addresses the common challenges of using Airflow, from deployment and development to collaboration, CI/CD, and monitoring. It suggests that Astronomer can significantly improve the experience of engineers working with Airflow, making it easier to manage, develop, and monitor data pipelines.
## The best alternative to Airlfow? Mage.ai
**Free hands-on tutorial:** [Watch on YouTube](https://youtu.be/3gXsFEC3aYA)
In this insightful video, Andreas introduces Mage, a promising alternative to Apache Airflow, focusing on its simplicity, user-friendliness, and scalability. The video provides a comprehensive walkthrough of Mage, highlighting its key features and advantages over Airflow. Here's a breakdown of what viewers can learn and expect from the video:
1. **Deployment Ease:** Mage offers a stark contrast to Airflow's complex setup process. It simplifies deployment to a single Docker image, making it straightforward to install and start on any machine, whether it's local or cloud-based on AWS, GCP, or Azure. This simplicity extends to scaling, which Mage handles horizontally, particularly beneficial in Kubernetes environments where performance scales with the number of pipelines.
2. **User Interface (UI):** Mage shines with its UI, presenting a dark mode interface that's not only visually appealing but also simplifies navigation and pipeline management. The UI facilitates easy access to pipelines, scheduling, and monitoring of pipeline runs, offering a more intuitive experience compared to Airflow.
3. **Pipeline Creation and Modification:** Mage streamlines the creation of ETL pipelines, allowing users to easily add data loaders, transformers, and exporters through its UI. It supports direct interaction with APIs for data loading and provides a visual representation of the data flow, enhancing the overall pipeline design experience.
4. **Data Visualization and Exploration:** Beyond simple pipeline creation, Mage enables in-depth data exploration within the UI. Users can generate various charts, such as histograms and bar charts, to analyze the data directly, a feature that greatly enhances the tool's utility.
5. **Testing and Scheduling:** Testing pipelines in Mage is straightforward, allowing for quick integration of tests to ensure data quality and pipeline reliability. Scheduling is also versatile, supporting standard time-based triggers, event-based triggers for real-time data ingestion, and API calls for on-demand pipeline execution.
6. **Support for Streaming and ELT Processes:** Mage is not limited to ETL workflows but also supports streaming and ELT processes. It integrates seamlessly with DBT models for in-warehouse transformations and Spark for big data processing, showcasing its versatility and scalability.
7. **Conclusion and Call to Action:** Andreas concludes by praising the direction in which the industry is moving, with tools like Mage simplifying data engineering processes. He encourages viewers to try Mage and engage with the content by liking, subscribing, and commenting on their current tools and the potential impact of Mage.
Overall, the video shows Mage as a highly user-friendly, scalable, and versatile tool for data pipeline creation and management, offering a compelling alternative to traditional tools like Airflow.
================================================
FILE: sections/05-CaseStudies.md
================================================
Case Studies
============
## Contents
- [Data Science @Airbnb](05-CaseStudies.md#data-science-at-Airbnb)
- [Data Science @Amazon](05-CaseStudies.md#data-science-at-Amazon)
- [Data Science @Baidu](05-CaseStudies.md#data-science-at-Baidu)
- [Data Science @Blackrock](05-CaseStudies.md#data-science-at-Blackrock)
- [Data Science @BMW](05-CaseStudies.md#data-science-at-BMW)
- [Data Science @Booking.com](05-CaseStudies.md#data-science-at-Booking.com)
- [Data Science @CERN](05-CaseStudies.md#data-science-at-CERN)
- [Data Science @Disney](05-CaseStudies.md#data-science-at-Disney)
- [Data Science @DLR](05-CaseStudies.md#data-science-at-DLR)
- [Data Science @Drivetribe](05-CaseStudies.md#data-science-at-Drivetribe)
- [Data Science @Dropbox](05-CaseStudies.md#data-science-at-Dropbox)
- [Data Science @Ebay](05-CaseStudies.md#data-science-at-Ebay)
- [Data Science @Expedia](05-CaseStudies.md#data-science-at-Expedia)
- [Data Science @Facebook](05-CaseStudies.md#data-science-at-Facebook)
- [Data Science @Google](05-CaseStudies.md#data-science-at-Google)
- [Data Science @Grammarly](05-CaseStudies.md#data-science-at-Grammarly)
- [Data Science @ING Fraud](05-CaseStudies.md#data-science-at-ING-Fraud)
- [Data Science @Instagram](05-CaseStudies.md#data-science-at-Instagram)
- [Data Science @LinkedIn](05-CaseStudies.md#data-science-at-LinkedIn)
- [Data Science @Lyft](05-CaseStudies.md#data-science-at-Lyft)
- [Data Science @NASA](05-CaseStudies.md#data-science-at-NASA)
- [Data Science @Netflix](05-CaseStudies.md#data-science-at-Netflix)
- [Data Science @OLX](05-CaseStudies.md#data-science-at-OLX)
- [Data Science @OTTO](05-CaseStudies.md#data-science-at-OTTO)
- [Data Science @Paypal](05-CaseStudies.md#data-science-at-Paypal)
- [Data Science @Pinterest](05-CaseStudies.md#data-science-at-Pinterest)
- [Data Science @Salesforce](05-CaseStudies.md#data-science-at-Salesforce)
- [Data Science @Siemens Mindsphere](05-CaseStudies.md#data-science-at-Siemens-Mindsphere)
- [Data Science @Slack](05-CaseStudies.md#data-science-at-Slack)
- [Data Science @Spotify](05-CaseStudies.md#data-science-at-Spotify)
- [Data Science @Symantec](05-CaseStudies.md#data-science-at-Symantec)
- [Data Science @Tinder](05-CaseStudies.md#data-science-at-Tinder)
- [Data Science @Twitter](05-CaseStudies.md#data-science-at-Twitter)
- [Data Science @Uber](05-CaseStudies.md#data-science-at-Uber)
- [Data Science @Upwork](05-CaseStudies.md#data-science-at-Upwork)
- [Data Science @Woot](05-CaseStudies.md#data-science-at-Woot)
- [Data Science @Zalando](05-CaseStudies.md#data-science-at-Zalando)
How I do Case Studies
---------------------
### Data Science at Airbnb
| Podcast Episode: #063 Data Engineering At Airbnb Case Study
|------------------|
|How is Airbnb doing data engineering? Let’s check it out.
| [Watch on YouTube](https://youtu.be/iokqkMfyIfo) \ [Listen on Anchor](https://anchor.fm/andreaskayy/episodes/063-Data-Engineering-At-Airbnb-Case-Study-e45il2)|
**Slides:**
Airbnb Engineering Blog:
Data Infrastructure:
Scaling the serving tier:
Druid Analytics:
Spark Streaming for logging events:
-Druid Wiki:
### Data Science at Amazon
### Data Science at Baidu
### Data Science at Blackrock
### Data Science at BMW
### Data Science at Booking.com
| Podcast Episode: #064 Data Engineering at Booking.com Case Study
|------------------|
|How is Booking.com doing data engineering? Let’s check it out.
| [Watch on YouTube](https://youtu.be/9GE3yiVo1FM) \ [Listen on Anchor](https://anchor.fm/andreaskayy/episodes/064-Data-Engineering-At-Booking-com-Case-Study-e45ilg)|
**Slides:**
Druid:
Kafka Architecture:
Confluent Platform:
### Data Science at CERN
| Podcast Episode: #065 Data Engineering At CERN Case Study
|------------------|
|How is CERN doing Data Engineering? They must get huge amounts of data from the Large Hadron Collider. Let’s check it out.
| [Watch on YouTube](https://youtu.be/LrhfzPsKaDE) \ [Listen on Anchor](https://anchor.fm/andreaskayy/episodes/065-Data-Engineering-At-CERN-Case-Study-e45ime)|
**Slides:**
### Data Science at Disney
### Data Science at DLR
### Data Science at Drivetribe
### Data Science at Dropbox
### Data Science at Ebay
### Data Science at Expedia
### Data Science at Facebook
### Data Science at Google
\
\
\
### Data Science at Grammarly
### Data Science at ING Fraud
### Data Science at Instagram
### Data Science at LinkedIn
| Podcast Episode: #073 Data Engineering At LinkedIn Case Study
|------------------|
|Let’s check out how LinkedIn is processing data :)
| [Watch on YouTube](https://youtu.be/wgfoE8Jbr_Q) \ [Listen on Anchor](https://anchor.fm/andreaskayy/episodes/073-Data-Engineering-At-LinkedIn-Case-Study-e45is6)|
**Slides:**
### Data Science at Lyft
### Data Science at NASA
| Podcast Episode: #067 Data Engineering At NASA Case Study
|------------------|
|A look into how NASA is doing data engineering.
| [Watch on YouTube](https://youtu.be/Pctn_1UoNjA) \ [Listen on Anchor](https://anchor.fm/andreaskayy/episodes/067-Data-Engineering-At-NASA-Case-Study-e45ina)|
**Slides:**
### Data Science at Netflix
| Podcast Episode: #062 Data Engineering At Netflix Case Study
|------------------|
|How Netflix is doing Data Engineering using their Keystone platform.
| [Watch on YouTube](https://youtu.be/YWPsYpjNKeM) \ [Listen on Anchor](https://anchor.fm/andreaskayy/episodes/062-Data-Engineering-At-Netflix-Case-Study-e45ikp)|
Netflix revolutionized how we watch movies and TV. Currently over 75
million users watch 125 million hours of Netflix content every day!
Netflix's revenue comes from a monthly subscription service. So, the
goal for Netflix is to keep you subscribed and to get new subscribers.
To achieve this, Netflix is licensing movies from studios as well as
creating its own original movies and TV series.
But offering new content is not everything. What is also very important
is, to keep you watching content that already exists.
To be able to recommend you content, Netflix is collecting data from
users. And it is collecting a lot.
Currently, Netflix analyses about 500 billion user events per day. That
results in a stunning 1.3 Petabytes every day.
All this data allows Netflix to build recommender systems for you. The
recommenders are showing you content that you might like, based on your
viewing habits, or what is currently trending.
###### The Netflix batch processing pipeline
When Netflix started out, they had a very simple batch processing system
architecture.
The key components were Chukwa, a scalable data collection system,
Amazon S3 and Elastic MapReduce.
![Old Netflix Batch Processing Pipeline[]{label="fig:Bild1"}](/images/Netflix-Chuckwa-Pipeline.jpg){#fig:Bild1
width="90%"}
Chukwa wrote incoming messages into Hadoop sequence files, stored in
Amazon S3. These files then could be analysed by Elastic MapReduce jobs.
Jobs in the Netflix batch processing pipeline were executed regularly on a
daily and hourly basis. As a result, Netflix could learn how people used
the services every hour or once a day.
###### Know what customers want:
Because you are looking at the big picture you can create new products.
Netflix uses insight from big data to create new TV shows and movies.
They created House of Cards based on data. There is a very interesting
TED talk about this you should watch:
[How to use data to make a hit TV show \| Sebastian
Wernicke](https://www.youtube.com/watch?v=vQILP19qABk)
Batch processing also helps Netflix to know the exact episode of a TV
show that gets you hooked. Not only globally but for every country where
Netflix is available.
Check out the article from TheVerge
They know exactly what show works in what country and what show does
not.
It helps them create shows that work everywhere or select the shows
to license in different countries. Germany for instance does not have
the full library that Americans have :(
We have to put up with only a small portion of TV shows and movies. If
you have to select, why not select those that work best.
###### Batch processing is not enough
As a data platform for generating insight the Chukwa pipeline was a good
start. It is very important to be able to create hourly and daily
aggregated views for user behavior.
To this day Netflix is still doing a lot of batch processing jobs.
The only problem is: With batch processing you are basically looking
into the past.
For Netflix, and data driven companies in general, looking into the past
is not enough. They want a live view of what is happening.
###### The trending now feature
One of the newer Netflix features is "Trending now". To the average user
it looks like "Trending Now" means currently most watched.
This is what I get displayed as trending while I am writing this on a
Saturday morning at 8:00 in Germany. But it is so much more.
What is currently being watched is only a part of the data that is used
to generate "Trending Now".
![Netflix Trending Now Feature[]{label="fig:Bild1"}](/images/Netflix-Trending-Now-Screenshot.jpg){#fig:Bild1
width="90%"}
"Trending now" is created based on two types of data sources: Play
events and Impression events.
What messages those two types actually include is not really
communicated by Netflix. I did some research on the Netflix Techblog and
this is what I found out:
Play events include what title you have watched last, where you stopped
watching, where you used the 30s rewind and others. Impression events
are collected as you browse the Netflix Library like scroll up and down,
scroll left or right, click on a movie and so on.
Basically, play events log what you do while you are watching.
Impression events are capturing what you do on Netflix, while you are
not watching something.
###### Netflix real-time streaming architecture
Netflix uses three internet facing services to exchange data with the
client's browser or mobile app. These services are simple Apache Tomcat
based web services.
The service for receiving play events is called "Viewing History".
Impression events are collected with the "Beacon" service.
The "Recommender Service" makes recommendations based on trend data
available for clients.
Messages from the Beacon and Viewing History services are put into
Apache Kafka. It acts as a buffer between the data services and the
analytics.
Beacon and Viewing History publish messages to Kafka topics. The
analytics subscribes to the topics and gets the messages automatically
delivered in a first in first out fashion.
After the analytics the workflow is straightforward. The trending data
is stored in a Cassandra Key-Value store. The recommender service has
access to Cassandra and is making the data available to the Netflix
client.
![Netflix Streaming Pipeline[]{label="fig:Bild1"}](/images/Netflix-Streaming-Pipeline.jpg){#fig:Bild1
width="90%"}
How the analytics system processes all this data is
not known to the public. It is a trade secret of Netflix.
What is known, is the analytics tool they use. Back in Feb 2015 they
wrote in the tech blog that they use a custom made tool.
They also stated, that Netflix is going to replace the custom made
analytics tool with Apache Spark streaming in the future. My guess is,
that they did the switch to Spark some time ago, because their post is
more than a year old.
### Data Science at OLX
| Podcast Episode: #083 Data Engineering at OLX Case Study
|------------------|
|This podcast is a case study about OLX with Senior Data Scientist Alexey Grigorev as guest. It was super fun.
| [Watch on YouTube](https://youtu.be/H_uFNoCvykM) \ [Listen on Anchor](https://anchor.fm/andreaskayy/episodes/083-Data-Engineering-at-OLX-Case-Study-e45j5n)|
**Slides:**
### Data Science at OTTO
### Data Science at Paypal
### Data Science at Pinterest
| Podcast Episode: #069 Engineering Culture At Pinterest
|------------------|
|In this podcast we look into data platform and processing at Pinterest.
| [Watch on YouTube](https://youtu.be/cqWXGVoDX8Q) \ [Listen on Anchor](https://anchor.fm/andreaskayy/episodes/069-Data-Engineering-At-Pinterest-Case-Study-e45ioh)|
**Slides:**
### Data Science at Salesforce
### Data Science at Siemens Mindsphere
| Podcast Episode: #059 What Is The Siemens Mindsphere IoT Platform?
|------------------|
|The Internet of things is a huge deal. There are many platforms available. But, which one is actually good? Join me on a 50 minute dive into the Siemens Mindsphere online documentation. I have to say I was super unimpressed by what I found. Many limitations, unclear architecture and no pricing available? Not good!
| [Watch on YouTube](https://youtu.be/HEd5Tsuy5HE) \ [Listen on Anchor](https://anchor.fm/andreaskayy/episodes/059-A-Look-Into-The-Siemens-Mindsphere-IoT-Platform---059-e45ihn)|
### Data Science at Slack
### Data Science at Spotify
| Podcast Episode: #071 Data Engineering At Spotify Case Study
|------------------|
|In this episode we are looking at data engineering at Spotify, my favorite music streaming service. How do they process all that data?
| [Watch on YouTube](https://youtu.be/0WJZ5wtQRWI) \ [Listen on Anchor](https://anchor.fm/andreaskayy/episodes/071-Data-Engineering-At-Spotify-Case-Study-e45iq1)|
**Slides:**
### Data Science at Symantec
### Data Science at Tinder
### Data Science at Twitter
| Podcast Episode: #072 Data Engineering At Twitter Case Study
|------------------|
|How is Twitter doing data engineering? Oh man, they have a lot of cool things to share these tweets.
| [Watch on YouTube](https://youtu.be/UkqSR3IeLZ8) \ [Listen on Anchor](https://anchor.fm/andreaskayy/episodes/072-Data-Engineering-At-Twitter-Case-Study-e45iqq)|
**Slides:**
### Data Science at Uber
### Data Science at Upwork
### Data Science at Woot
### Data Science at Zalando
| Podcast Episode: #087 Data Engineering At Zalando Case Study Talk
|------------------|
|I had a great conversation about data engineering for online retailing with Michal Gancarski and Max Schultze. They showed Zalando’s data platform and how they build data pipelines. Super interesting especially for AWS users.
| [Watch on YouTube](https://youtu.be/IXOLsNA6Hm0)
Do me a favor and give these guys a follow on LinkedIn:
LinkedIn of Michal:
LinkedIn of Max:
Zalando has a tech blog with more info and there is also a meetup in
Berlin:
Zalando Blog:
Next Zalando Data Engineering Meetup:
Interesting tools:
AWS CDK:
Delta Lake:
AWS Step Functions:
[https://aws.amazon.com/step-functions/](https://aws.amazon.com/step-functions/)
AWS State Language: [https://states-language.net/spec.html](https://states-language.net/spec.html)
Youtube channel of the meetup:
[https://www.youtube.com/channel/UCxwul7aBm2LybbpKGbCOYNA/playlists](https://www.youtube.com/channel/UCxwul7aBm2LybbpKGbCOYNA/playlists)
Talk at Spark+AI Summit about Zalando's Processing Platform:
Talk at Strata London slides:
================================================
FILE: sections/06-BestPracticesCloud.md
================================================
Best Practices Cloud Platforms
=============================
This section is a collection of best practices on how you can combine the tools into a platform.
It's here especially to help you start your own project in the cloud on AWS, Azure and GCP.
Like the advanced skills section, this section also follows [My Data Science Platform Blueprint](sections/01-Introduction.md#my-big-data-platform-blueprint).
In the blueprint I divided the platform into sections: Connect, Buffer, Processing, Store and Visualize.
This order will help you learn how to connect the right tools together.
Take your time and research the tools and learn how they work.
Right now the Azure section has a lot of links to platform examples.
They are also useful for AWS and GCP, just try to change out the tools.
As always, I am going to add more stuff to this over time.
Have fun!
## Contents
- [Amazon Web Services (AWS)](06-BestPracticesCloud.md#aws)
- [Connect](06-BestPracticesCloud.md#Connect)
- [Buffer](06-BestPracticesCloud.md#Buffer)
- [Processing](06-BestPracticesCloud.md#Processing)
- [Store](06-BestPracticesCloud.md#Store)
- [Visualize](06-BestPracticesCloud.md#Visualize)
- [Containerization](06-BestPracticesCloud.md#Containerization)
- [Best Practices](06-BestPracticesCloud.md#Best-Practices)
- [More Details](06-BestPracticesCloud.md#More-Details)
- [Microsoft Azure](06-BestPracticesCloud.md#azure)
- [Connect](06-BestPracticesCloud.md#Connect-1)
- [Buffer](06-BestPracticesCloud.md#Buffer-1)
- [Processing](06-BestPracticesCloud.md#Processing-1)
- [Store](06-BestPracticesCloud.md#Store-1)
- [Visualize](06-BestPracticesCloud.md#Visualize-1)
- [Containerization](06-BestPracticesCloud.md#Containerization-1)
- [Best Practices](06-BestPracticesCloud.md#Best-Practices-1)
- [Google Cloud Platform (GCP)](06-BestPracticesCloud.md#gcp)
- [Connect](06-BestPracticesCloud.md#Connect-2)
- [Buffer](06-BestPracticesCloud.md#Buffer-2)
- [Processing](06-BestPracticesCloud.md#Processing-2)
- [Store](06-BestPracticesCloud.md#Store-2)
- [Visualize](06-BestPracticesCloud.md#Visualize-2)
- [Containerization](06-BestPracticesCloud.md#Containerization-2)
- [Best Practices](06-BestPracticesCloud.md#Best-Practices-2)
# AWS
## Connect
- Elastic Beanstalk (very old)
- SES Simple Email Service
- API Gateway
## Buffer
- Kinesis
- Kinesis Data Firehose
- Managed Streaming for Kafka (MSK)
- MQ
- Simple Queue Service (SQS)
- Simple Notification Service (SNS)
## Processing
- EC2
- Athena
- EMR
- Elasticsearch
- Kinesis Data Analytics
- Glue
- Step Functions
- Fargate
- Lambda
- SageMaker
## Store
- Simple Storage Service (S3)
- Redshift
- Aurora
- RDS
- DynamoDB
- ElastiCache
- Neptune Graph DB
- Timestream
- DocumentDB (MongoDB compatible)
## Visualize
- Quicksight
## Containerization
- Elastic Container Service (ECS)
- Elastic Container Registry (ECR)
- Elastic Kubernetes Service (EKS)
## Best Practices
Deploying a Spring Boot Application on AWS Using AWS Elastic Beanstalk:
[https://aws.amazon.com/de/blogs/devops/deploying-a-spring-boot-application-on-aws-using-aws-elastic-beanstalk/](https://aws.amazon.com/de/blogs/devops/deploying-a-spring-boot-application-on-aws-using-aws-elastic-beanstalk/)
How to deploy a Docker Container on AWS:
[https://aws.amazon.com/getting-started/hands-on/deploy-docker-containers/](https://aws.amazon.com/getting-started/hands-on/deploy-docker-containers/)
#### AWS platform architecture for GenAI

▶ [Click here to watch](https://youtu.be/2yX6G4ZURbc)
I recorded a reaction video to an AWS platform architecture for GenAI called Tailwinds. Presented by John from Innovative Solutions and Josh from AWS, it has two main flows: indexing and consumer.
Data enters through S3 buckets or an API gateway, processed by AWS Lambda or Glue, and stored in a vector or graph database, then indexed in OpenSearch. Applications like chatbots use an API gateway to trigger Lambda functions for data retrieval and processing. This flexible serverless setup supports various data formats and uses tools like SAM and Terraform.
Amazon Bedrock helps customers choose and evaluate models. The architecture is flexible but requires effort to create the necessary Lambda functions. Check out the video and share your thoughts!
▶ [Click here to watch](https://youtu.be/2yX6G4ZURbc)
#### Generative AI enabled job search engine

▶ [Click here to watch](https://youtu.be/dOWqasmqfHQ)
Hey everyone, I recorded a reaction video to an AWS platform architecture for a Gen AI job search engine. Presented by Andrea from AWS and Bill from Healthy Careers, this setup uses generative AI to enhance job searches for healthcare professionals.
The architecture uses Elastic Container Service (ECS) to handle user queries, processed by Claude 2 for prompt checks and geolocation. Cleaned prompts are vectorized using Amazon's Titan model, with user search history fetched from an SQL database. Search results are stored in Elasticsearch, updating every six hours. Finally, Claude 2 generates a response from the search results and sends it back to the user.
I found the use of Claude 2 for prompt sanitization and geolocation, and the integration of multiple AI models through AWS Bedrock, particularly interesting. This setup keeps data private and provides a flexible, efficient job search experience.
Check out the video and share your thoughts!
#### Voice transcription and analysis on AWS

▶ [Click here to watch](https://youtu.be/RGXRjOTQuBM)
Hey everyone, I recorded a reaction video to an AWS architecture for voice transcription and analysis. Presented by Nuan from AWS and Ben from Assembly AI, this system is designed to handle large-scale audio data processing.
Users upload audio data via an API to an ECS container. The data is then managed by an orchestrator that decides which models to use and in what order. The orchestrator sends tasks to SQS, which triggers various ML models running on ECS. These models handle tasks like speech-to-text conversion, sentiment analysis, and speaker labeling. Results are stored in S3 and users are notified via SNS and a Lambda function when processing is complete.
I found the use of ECS for containerized applications and the flexibility of swapping models through ECR particularly interesting. This architecture ensures scalability and efficiency, making it ideal for handling millions of requests per day.
Check out the video and share your thoughts!
#### GeoSpatial Data Analysis

▶ [Click here to watch](https://youtu.be/MxVJAvFSTXg)
Hey everyone, I recorded a reaction video to an AWS architecture for geospatial data analysis by TCS. Presented by David John and Suryakant from TCS, this platform is used in next-gen agriculture for tasks like crop health, yield, and soil moisture analysis.
The platform uses data from satellites, AWS open data, and field agents, processing it with Lambda, Sagemaker, and PostgreSQL. Data is stored and analyzed in S3 buckets and PostgreSQL, with results made accessible via EKS-deployed UIs on EC2 instances, buffered through CloudFront for efficiency.
Key aspects include:
- Lambda functions triggering Sagemaker jobs for machine learning.
- Sagemaker handling extensive processing tasks.
- PostgreSQL and S3 for storing processed data.
- CloudFront caching data to enhance user experience.

I found the use of parallel Sagemaker jobs for scalability and the integration of open data for cost efficiency particularly interesting. This setup effectively meets the agricultural sector's data analysis needs.
Check out the video and share your thoughts!
#### Building a Self-Service Enterprise Data Engineering Platform

▶ [Click here to watch](https://youtu.be/E9JFCl7bk88)
Hey everyone, I recorded a reaction video to an AWS architecture for a self-service enterprise data engineering platform by ZS Associates. Presented by David John and Laken from ZS Associates, this platform is designed to streamline data integration, infrastructure provisioning, and data access for life sciences companies.
Key components:
- **Users and Interaction**: Data engineers and analysts interact through a self-service web portal, selecting infrastructure types and providing project details. This portal makes REST requests to EKS, which creates records in PostgreSQL and triggers infrastructure provisioning via SQS.
- **Infrastructure Provisioning**: EKS processes SQS messages to provision infrastructure such as EMR clusters, databases in Glue Catalog, S3 buckets, and EC2 instances with containerized services like Airflow or NiFi. IAM roles are configured for access control.
- **Data Governance and Security**: All data sets are accessed through the Glue Catalog, with governance workflows requiring approval from data owners via SES notifications. EKS updates IAM roles and Ranger policies for fine-grained access control.
- **Scalability and Efficiency**: EKS hosts 100+ microservices supporting workflows and UI portals. The platform handles millions of API requests and hundreds of data access requests monthly, with auto-scaling capabilities to manage costs.
This architecture effectively reduces time to market, enhances security at scale, and optimizes costs by automating data access and infrastructure provisioning. It also ensures data governance and security through controlled access and approval processes.
Check out the video and share your thoughts!
#### Customer Support Platform

▶ [Click here to watch](https://youtu.be/sCIFpOuryFU)
Hey everyone, I recorded a reaction video to an AWS architecture for a personalized customer support platform by Traeger. Presented by David John and Lizzy from Traeger, this system enhances customer support by leveraging data from Shopify, EventBridge, Kinesis Data Firehose, S3, Lambda, DynamoDB, and Amazon Connect.
Key components:
- **Order Processing**: Customer order data from Shopify flows into EventBridge, then to Kinesis Data Firehose, which writes it to S3. An event trigger in S3 invokes a Lambda function that stores specific order metadata in DynamoDB.
- **Personalized Customer Support**: When a customer calls, Amazon Connect uses Pinpoint to determine the call's origin, personalizing the language options. Connect triggers a Lambda function to query DynamoDB for customer metadata based on the phone number. This data is used to inform the customer support agent.
- **Reason for Contact**: Amazon Lex bot asks the customer the reason for their call, and this information, along with customer metadata, routes the call to a specialized support queue.
I found the use of DynamoDB for storing customer metadata and the integration with Amazon Connect and Lex for personalized support particularly interesting. The architecture is scalable and ensures a personalized experience for customers.
Check out the video and share your thoughts!
#### League of Legends Data Platform on AWS

▶ [Click here to watch](https://youtu.be/FX_ZUJk_WoE)
Hey everyone, I recorded a reaction video to an AWS architecture for the data platform that powers League of Legends by Riot Games. Presented by David John and the team at Riot Games, this system handles massive amounts of data generated by millions of players worldwide.
Key components:
- **Player Interaction**: Players connect to game servers globally. The game client communicates with an API running in EKS. This setup ensures low latency and optimal performance.
- **Data Ingestion**: The game client and server send data about player interactions to EKS, which flows into MSK (Managed Streaming for Kafka). Local Kafka clusters buffer the data before it’s replicated to regional MSK clusters using MirrorMaker.
- **Data Processing**: Spark Streaming jobs process the data from MSK and store it in Delta Lake on S3. This setup ensures efficient data handling and reduces latency in data availability.
- **Data Storage and Access**: Glue serves as the data catalog, managing metadata and permissions. Data consumers, including analysts, designers, engineers, and executives, access this data through Databricks, leveraging Glue for structured queries.
I found the use of MSK and Spark for scalable data ingestion and processing particularly interesting. This architecture supports real-time analytics, allowing Riot Games to quickly assess the impact of new patches and gameplay changes.
Check out the video and share your thoughts!
#### Platform Connecting 70 Million Cars

▶ [Click here to watch](https://youtu.be/1nifzmvOGHs)
Hey everyone, I recorded a reaction video to an AWS architecture for a connected car platform by Mobileye. Presented by David John and the team at Mobileye, this system connects 70 million cars, collecting and processing data to offer digital services and fleet analysis.
Key components:
- **Data Collection**: Cars collect anonymized data using sensors and visual inspections, sending it to a REST API and storing it in S3.
- **Data Processing**: The data is pulled from S3 into SQS and processed by EKS workers, which scale according to the queue size. Processed data is stored back in S3 and further analyzed using step functions and Lambda for tasks like extracting construction zones and clustering observations.
- **Data Storage**: Processed data is stored in S3, Elasticsearch, and CockroachDB. Elasticsearch handles document-based data with self-indexing, while CockroachDB supports frequent updates.
- **Data Consumption**: EKS hosts a secured REST API and web application, allowing customers like city planners to access insights on pedestrian and bicycle traffic.
Future plans include enabling cloud image processing on EKS with GPU instances and focusing on cost reduction as data flow increases.
I found the use of EKS for scalable data processing and the combination of Elasticsearch and CockroachDB for different data needs particularly interesting. This architecture efficiently handles large-scale data from millions of connected cars.
Check out the video and share your thoughts!
#### 55TB A Day: Nielsen AWS Data Architecture

▶ [Click here to watch](https://youtu.be/WCQe1VP_q5A)
Hey everyone, I recorded a reaction video to an AWS architecture for Nielsen Marketing Cloud, which processes 55TB of data daily. Presented by David John, this system handles marketing segmentation data for campaigns.
Key components:
- **Data Ingestion**: Marketing data comes in files, written to S3. Spark on EMR processes and transforms the data, writing the output to another S3 bucket.
- **Data Processing**: Lambda functions handle the final formatting and upload the data to over 100 ad networks. Metadata about file processing is managed in a PostgreSQL RDS database.
- **Metadata Management**: A work manager Lambda reads metadata from RDS, triggers processing jobs in EMR, and updates the metadata post-processing.
- **Scaling and Rate Limiting**: The serverless architecture allows automatic scaling. However, rate limiting is implemented to prevent overloading ad networks, ensuring they handle data bursts smoothly.
Challenges and Solutions:
- **Scale**: The system handles 250 billion events per day, scaling up and down automatically to manage peak loads.
- **Rate Limiting**: To avoid overwhelming ad networks, a rate-limiting mechanism was introduced, managing data flow based on network capacity.
- **Back Pressure Management**: SQS is used to buffer Lambda responses, preventing direct overload on the PostgreSQL database.
I found the use of SQS for metadata management and the serverless architecture for handling massive data loads particularly interesting. This setup ensures efficient data processing and smooth delivery to ad networks.
Check out the video and share your thoughts!
#### Orange Theory Fitness

▶ [Click here to watch](https://youtu.be/ssaXRo5s1r4)
Hey, everybody! Today, I'm reacting to the AWS data infrastructure at Orange Theory Fitness, where they collect data from wristbands and training machines. Let's dive in and see how they manage it all.
##### Key Components
1. **Local Server**: Aggregates data from in-studio equipment and mobile apps, ensuring resiliency if the cloud connection is lost.
2. **API Gateway and Cognito**: Handle authentication and route data to the cloud.
3. **Lambda Functions**: Process data.
4. **Aurora RDS (MySQL)**: Stores structured data like member profiles, class bookings, and studio information.
5. **DynamoDB**: Stores performance metrics and workout statistics for quick access.
6. **S3**: Serves as a data lake, storing telemetry data.
7. **Kinesis Firehose**: Streams telemetry data to S3.
##### Challenges & Solutions
1. **Resiliency**
- **Challenge**: Ensure operations continue if cloud connection is lost.
- **Solution**: Local server aggregates data and syncs with the cloud once the connection is restored.
2. **Data Integration**
- **Challenge**: Integrate data from various sources.
- **Solution**: Use API Gateway and Cognito for unified authentication and data routing.
3. **Data Processing**
- **Challenge**: Efficiently process and store different types of data.
- **Solution**: Use Lambda for processing, Aurora RDS for structured data, DynamoDB for quick access to performance metrics, and Kinesis Firehose with S3 for streaming and storing large volumes of telemetry data.
This architecture leverages AWS tools for scalability, flexibility, and resilience, making it an excellent example of a well-thought-out data infrastructure for a fitness application.
Let me know your thoughts in the comments. What do you think of this architecture? Would you have done anything differently? If you have any questions, feel free to ask. And if you're interested in learning more about data engineering, check out my academy at learndataengineering.com. See you in the next video!
## More Details
AWS Whitepapers:
[https://d1.awsstatic.com/whitepapers/aws-overview.pdf](https://d1.awsstatic.com/whitepapers/aws-overview.pdf)
# Azure
## Connect
- Event Hub
- IoT Hub
## Buffer
- Data Factory
- Event Hub
- RedisCache (also Store)
## Processing
- Stream Analytics Service
- Azure Databricks
- Machine Learning
- Azure Functions
- Azure HDInsight (Hadoop PaaS)
## Store
- Blob
- CosmosDB
- MariaDB
- MySQL
- PostgreSQL
- SQL
- Azure Data lake
- Azure Storage (SQL Table?)
- Azure Synapse Analytics
## Visualize
- PowerBI
## Containerization
- Virtual Machines
- Virtual Machine Scale Sets
- Azure Container Service (ACS)
- Container Instances
- Azure Kubernetes Service (AKS)
## Best Practices
Advanced Analytics Architecture:
[https://docs.microsoft.com/en-us/azure/architecture/solution-ideas/articles/advanced-analytics-on-big-data](https://docs.microsoft.com/en-us/azure/architecture/solution-ideas/articles/advanced-analytics-on-big-data)
Anomaly Detection in Real-time Data Streams:
[https://docs.microsoft.com/en-us/azure/architecture/solution-ideas/articles/anomaly-detection-in-real-time-data-streams](https://docs.microsoft.com/en-us/azure/architecture/solution-ideas/articles/anomaly-detection-in-real-time-data-streams)
Modern Data Warehouse Architecture:
[https://docs.microsoft.com/en-us/azure/architecture/solution-ideas/articles/modern-data-warehouse](https://docs.microsoft.com/en-us/azure/architecture/solution-ideas/articles/modern-data-warehouse)
CI/CD for Containers:
[https://docs.microsoft.com/en-us/azure/architecture/solution-ideas/articles/cicd-for-containers](https://docs.microsoft.com/en-us/azure/architecture/solution-ideas/articles/cicd-for-containers)
Real Time Analytics on Big Data Architecture:
[https://docs.microsoft.com/en-us/azure/architecture/solution-ideas/articles/real-time-analytics](https://docs.microsoft.com/en-us/azure/architecture/solution-ideas/articles/real-time-analytics)
Anomaly Detection in Real-time Data Streams:
[https://docs.microsoft.com/en-us/azure/architecture/solution-ideas/articles/anomaly-detection-in-real-time-data-streams](https://docs.microsoft.com/en-us/azure/architecture/solution-ideas/articles/anomaly-detection-in-real-time-data-streams)
IoT Architecture – Azure IoT Subsystems:
[https://docs.microsoft.com/en-us/azure/architecture/solution-ideas/articles/azure-iot-subsystems](https://docs.microsoft.com/en-us/azure/architecture/solution-ideas/articles/azure-iot-subsystems)
Tier Applications & Data for Analytics:
[https://docs.microsoft.com/en-us/azure/architecture/solution-ideas/articles/tiered-data-for-analytics](https://docs.microsoft.com/en-us/azure/architecture/solution-ideas/articles/tiered-data-for-analytics)
Extract, transform, and load (ETL) using HDInsight:
[https://docs.microsoft.com/en-us/azure/architecture/solution-ideas/articles/extract-transform-and-load-using-hdinsight](https://docs.microsoft.com/en-us/azure/architecture/solution-ideas/articles/extract-transform-and-load-using-hdinsight)
IoT using Cosmos DB:
[https://docs.microsoft.com/en-us/azure/architecture/solution-ideas/articles/iot-using-cosmos-db](https://docs.microsoft.com/en-us/azure/architecture/solution-ideas/articles/iot-using-cosmos-db)
Streaming using HDInsight:
[https://docs.microsoft.com/en-us/azure/architecture/solution-ideas/articles/streaming-using-hdinsight](https://docs.microsoft.com/en-us/azure/architecture/solution-ideas/articles/streaming-using-hdinsight)
# GCP
## Connect
- Cloud IoT Core
- App Engine
- Cloud Dataflow
## Buffer
- Pub/Sub
## Processing
- Compute Engine
- Cloud Functions
- Specialized tools:
- Cloud Dataflow
- Cloud Dataproc
- Cloud Datalab
- Cloud Dataprep
- Cloud Composer
- App Engine
## Store
- Cloud Storage
- Cloud SQL
- Cloud Spanner
- Cloud Datastore
- Cloud BigTable
- Cloud Storage
- Cloud Memorystore
- BigQuery
## Visualize
## Containerization
- Kubernetes Engine
- Container Security
## Best Practices
Thanks to Ismail Holoubi for the following GCP links:
Best practices for migrating virtual machines to Compute Engine:
https://cloud.google.com/solutions/best-practices-migrating-vm-to-compute-engine
Best practices for Cloud Storage:
https://cloud.google.com/storage/docs/best-practices
Moving a publishing workflow to BigQuery for new data insights:
https://cloud.google.com/blog/products/data-analytics/moving-a-publishing-workflow-to-bigquery-for-new-data-insights
Architecture: Optimizing large-scale ingestion of analytics events and logs:
https://cloud.google.com/solutions/architecture/optimized-large-scale-analytics-ingestion
Choosing the right architecture for global data distribution:
https://cloud.google.com/solutions/architecture/global-data-distribution
Best Practices for Operating Containers:
https://cloud.google.com/solutions/best-practices-for-operating-containers
Automating IoT Machine Learning: Bridging Cloud and Device Benefits with AI Platform:
https://cloud.google.com/solutions/automating-iot-machine-learning
================================================
FILE: sections/07-DataSources.md
================================================
100 Plus Data Sources Data Science
===================================
This is a section with links to data sources. During my data engineer coaching we need to find good data sets to work with.
So, I started this section to make it easier to find good sources.
I've taken these links from articles and blog posts. Why not only link the articles?
You know, these posts can go away at any time. I want to keep the links to the platforms either way.
I haven't had the chance to check each link myself. Please let me know if something isn't right.
You can find the articles at the bottom of this section to read more. They include even more data sources I haven't had time to add to this list.
## Contents
- [Student Favorites](07-DataSources.md#Student-Favorites)
- [Content Marketing](07-DataSources.md#Content-Marketing)
- [Crime](07-DataSources.md#Crime)
- [Drugs](07-DataSources.md#Drugs)
- [Education](07-DataSources.md#Education)
- [Entertainment](07-DataSources.md#Entertainment)
- [Environmental And Weather Data](07-DataSources.md#Environmental-And-Weather-Data)
- [Financial And Economic Data](07-DataSources.md#Financial-And-Economic-Data)
- [General And Academic](07-DataSources.md#General-And-Academic)
- [Government And World](07-DataSources.md#Government-And-World)
- [Health](07-DataSources.md#Health)
- [Human Rights](07-DataSources.md#Human-Rights)
- [Labor And Employment Data](07-DataSources.md#Labor-And-Employment-Data)
- [Politics](07-DataSources.md#Politics)
- [Retail](07-DataSources.md#Retail)
- [Social](07-DataSources.md#Social)
- [Source Articles and Blog Posts](07-DataSources.md#Source-Articles-and-Blog-Posts)
- [Travel And Transportation](07-DataSources.md#Travel-And-Transportation)
- [Various Portals](07-DataSources.md#Various-Portals)
## Student Favorites
In my Coaching program my students learn by doing a project. And the foundation of every project is selecting a dataset.
That can be an API or a file source, depending a lot on the student's goals and interests.
They work out goals for the dataset, figure out the data modeling, create the architecture and build it.
It's a fun way to learn and get better at Data Engineering.
Here's a list of my students' favorite datasets and APIs:
Learn more about the Coaching program: [click here](https://learndataengineering.com/p/data-engineering-coaching)
### Datasets
- [Fraud detection](https://www.kaggle.com/datasets/kartik2112/fraud-detection)
- [Industrial equipment monitoring](https://www.kaggle.com/datasets/dnkumars/industrial-equipment-monitoring-dataset)
- [Energy demand & generation](https://www.kaggle.com/datasets/nicholasjhana/energy-consumption-generation-prices-and-weather?select=weather_features.csv)
- [Online Retail](https://www.kaggle.com/datasets/tunguz/online-retail)
- [Brazilian E-commerce](https://www.kaggle.com/datasets/olistbr/brazilian-ecommerce)
- [Beijing Air Quality](https://www.kaggle.com/datasets/sid321axn/beijing-multisite-airquality-data-set)
- [NYC Taxi](https://www.kaggle.com/datasets/diishasiing/revenue-for-cab-drivers)
### APIs
- [Bike sharing Bluebikes](https://bluebikes.com/system-data)
- [Bike sharing Divvy Bikes](https://divvybikes.com/system-data)
- [Weather API](https://www.weatherapi.com/docs/)
- [Bluesky API](https://docs.bsky.app/docs/advanced-guides/api-directory)
- [Guardian news API](https://open-platform.theguardian.com/)
- [Football API](https://www.api-football.com/)
## General And Academic
- [Amazon Public Data Sets](https://registry.opendata.aws/)
- [Datasets Subreddit](https://www.reddit.com/r/datasets)
- [Enigma Public](https://public.enigma.com/)
- [FiveThirtyEight](http://fivethirtyeight.com/)
- [Google Scholar](http://scholar.google.com/)
- [Pew Research](http://www.pewresearch.org/)
- [The Upshot by New York Times](http://www.nytimes.com/section/upshot)
- [UNData](http://data.un.org/)
## Content Marketing
- [Buffer](https://blog.bufferapp.com/)
- [Content Marketing Institute](http://contentmarketinginstitute.com/about/)
- [HubSpot](http://www.hubspot.com/marketing-statistics)
- [Moz](https://moz.com/blog)
## Crime
- [Bureau of Justice Statistics](http://www.bjs.gov/index.cfm?ty=dca)
- [FBI Crime Statistics](https://www.fbi.gov/stats-services/crimestats)
- [National Archive of Criminal Justice Data](https://www.icpsr.umich.edu/icpsrweb/NACJD/)
- [Uniform Crime Reporting Statistics](https://crime-data-explorer.fr.cloud.gov/)
## Drugs
- [Drug Data and Database by First Databank](http://www.fdbhealth.com/)
- [Drug War Facts](http://www.drugwarfacts.org/)
- [National Institute on Drug Abuse](https://www.drugabuse.gov/related-topics/trends-statistics)
- [U.S. Food and Drug Administration](http://www.fda.gov/Drugs/InformationOnDrugs/ucm079750.htm)
- [United Nations Office on Drugs and Crime](https://www.unodc.org/unodc/en/data-and-analysis/)
## Education
- [Education Data by the World Bank](http://data.worldbank.org/topic/education)
- [Education Data by Unicef](http://data.unicef.org/education/overview.html)
- [National Center for Education Statistics](https://nces.ed.gov/)
## Entertainment
- [Academic Rights Press](http://www.academicrightspress.com/entertainment/music)
- [BFI Film Forever](http://www.bfi.org.uk/education-research/film-industry-statistics-research)
- [BLS: Arts, Entertainment, and Recreation](http://www.bls.gov/iag/tgs/iag71.htm)
- [IFPI](http://www.ifpi.org/global-statistics.php)
- [Million Song Dataset](https://aws.amazon.com/datasets/million-song-dataset/)
- [Statista: Film Industry](http://www.statista.com/topics/964/film/)
- [Statista: Music Industry](http://www.statista.com/topics/1639/music/)
- [Statista: Video Game Industry](http://www.statista.com/topics/868/video-games/)
- [The Numbers](http://www.the-numbers.com/)
## Environmental And Weather Data
- [Environmental Protection Agency](https://www.epa.gov/data)
- [International Energy Agency Atlas](https://www.iea.org/data-and-statistics?country=WORLD&fuel=Energy%20supply&indicator=TPESbySource)
- [National Center for Environmental Health](http://www.cdc.gov/nceh/data.htm)
- [National Climatic Data Center](http://www.ncdc.noaa.gov/data-access/quick-links#loc-clim)
- [National Weather Service](http://www.weather.gov/help-past-weather)
- [Weather Underground](https://www.wunderground.com/)
- [WeatherBase](http://www.weatherbase.com/)
## Financial And Economic Data
- [Federal Reserve Economic Database](https://fred.stlouisfed.org/)
- [Financial Data Finder at OSU](./) - Missing link.
- [Global Financial Data](https://www.globalfinancialdata.com/index.html)
- [Google Finance](https://www.google.com/finance)
- [Google Public Data Explorer](http://www.google.com/publicdata/directory)
- [IMF Economic Data](https://data.imf.org/?sk=388dfa60-1d26-4ade-b505-a05a558d9a42)
- [National Bureau of Economic Research](http://www.nber.org/data/)
- [OpenCorporates](https://opencorporates.com/)
- [The Atlas of Economic Complexity](http://atlas.cid.harvard.edu/)
- [U.S. Bureau of Economic Analysis](http://www.bea.gov/)
- [U.S. Securities and Exchange Commission](https://www.sec.gov/dera/data/financial-statement-data-sets.html)
- [UN Comtrade Database](https://comtrade.un.org/labs/)
- [Visualizing Economics](http://visualizingeconomics.com/)
- [World Bank Doing Business Database](http://www.doingbusiness.org/rankings)
- [World Bank Open Data](http://data.worldbank.org/)
## Government And World
- [Data.gov](http://www.data.gov/)
- [European Union Open Data Portal](http://data.europa.eu/euodp/en/data/)
- [Gapminder](https://www.gapminder.org/data/)
- [Land Matrix (Transnational Land Database)](http://landmatrix.org/en/)
- [OECD Aid Database](http://www.oecd.org/dac/financing-sustainable-development/development-finance-data/)
- [Open Data Network](http://www.opendatanetwork.com/)
- [The CIA World Factbook](https://www.cia.gov/the-world-factbook/)
- [The World Bank’s World Development Indicators](http://data.worldbank.org/data-catalog/world-development-indicators)
- [U.S. Census Bureau](http://www.census.gov/)
- [UNDP’s Human Development Index](http://hdr.undp.org/en/data)
## Health
- [America’s Health Rankings](http://www.americashealthrankings.org/)
- [Centers for Disease Control and Prevention](http://www.cdc.gov/datastatistics/)
- [Health & Social Care Information Centre](http://www.hscic.gov.uk/home)
- [Health Services Research Information Central](https://www.nlm.nih.gov/hsrinfo/datasites.html)
- [HealthData.gov](https://www.healthdata.gov/)
- [Medicare Hospital Quality](https://data.medicare.gov/data/hospital-compare#)
- [MedlinePlus](https://www.nlm.nih.gov/medlineplus/healthstatistics.html)
- [National Center for Health Statistics](http://www.cdc.gov/nchs/)
- [SEER Cancer Incidence](http://seer.cancer.gov/faststats/selections.php?series=cancer)
- [World Health Organization](http://www.who.int/en/)
## Human Rights
- [Amnesty International](https://www.amnesty.org/en/search/?q=&documentType=Annual+Report)
- [Human Rights Data Analysis Group](https://hrdag.org/)
- [The Armed Conflict Database by Uppsala University](http://www.pcr.uu.se/research/UCDP/)
## Labor And Employment Data
- [Bureau of Labor Statistics](http://www.bls.gov/)
- [Department of Labor](https://www.dol.gov/general/topic/statistics/employment)
- [Employment by U.S. Census](http://www.census.gov/topics/employment.html)
- [U.S. Small Business Administration](https://www.sba.gov/starting-business/how-start-business/business-data-statistics/employment-statistics)
## Politics
- [California Field Poll](http://dlab.berkeley.edu/data-resources/california-polls)
- [Crowdpac](https://www.crowdpac.com/)
- [Gallup](http://www.gallup.com/home.aspx)
- [Open Secrets](https://www.opensecrets.org/)
- [Rand State Statistics](http://www.randstatestats.org/us/)
- [Real Clear Politics](https://www.realclearpolitics.com/)
- [Roper Center for Public Opinion Research](https://ropercenter.cornell.edu/)
- [US Voter Files](http://voterlist.electproject.org/) Note only some states are free, and most do not allow voter files to be used for commercial purposes - this map allows you to see the rules/cost for each state.
## Retail
- [Love the Sales](https://www.lovethesales.com/press/data-request)
## Social
- [Facebook Graph API](https://developers.facebook.com/docs/graph-api)
- [Google Trends](http://www.google.com/trends/explore)
- [SocialMention](./) - Missing link.
## Travel And Transportation
- [Bureau of Transportation Statistics](https://www.bts.gov/browse-statistical-products-and-data)
- [Monthly Tourism Statistics – U.S. Travelers Overseas](http://travel.trade.gov/research/monthly/departures/)
- [Search the World](http://www.geoba.se/)
- [SkiftStats](https://skift.com/skiftx/skiftstats/)
- [U.S. Travel Association](https://www.ustravel.org/research)
## Various Portals
- [Ckan](https://ckan.org/)
- [Dataverse](https://dataverse.org/)
- [DBpedia](https://wiki.dbpedia.org/)
- [freeCodeCamp Open Data](https://github.com/freeCodeCamp/open-data)
- [Kaggle](https://www.kaggle.com/datasets)
- [LODUM](https://lodum.de/)
- [Open Data Impact Map](http://opendataimpactmap.org/)
- [Open Data Kit](https://opendatakit.org/)
- [Open Data Monitor](https://opendatamonitor.eu/frontend/web/index.php?r=dashboard%2Findex)
- [Plenar.io](http://plenar.io/)
- [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/index.php)
- [Yelp Open Datasets](https://www.yelp.com/dataset)
## Source Articles and Blog Posts
- [100+ of the Best Free Data Sources For Your Next Project](https://www.columnfivemedia.com/100-best-free-data-sources-infographic)
- [15 Great Free Data Sources for 2016](https://medium.com/@Infogram/15-great-free-data-sources-for-2016-25cb455db257)
- [20 Awesome Sources of Free Data](https://www.searchenginejournal.com/free-data-sources/302601/#close)
- [30+ Free Data Sources Every Company Should Be Aware Of](https://www.bernardmarr.com/default.asp?contentID=960)
- [50 Amazing Free Data Sources You Should Know](https://infogram.com/blog/free-data-sources/)
- [50 Best Open Data Sources Ready to be Used Right Now](https://learn.g2.com/open-data-sources)
- [70 Amazing Free Data Sources You Should Know](https://www.kdnuggets.com/2017/12/big-data-free-sources.html)
- [Big Data: 33 Brilliant And Free Data Sources Anyone Can Use](https://www.forbes.com/sites/bernardmarr/2016/02/12/big-data-35-brilliant-and-free-data-sources-for-2016/#527557ffb54d)
- [These Are The Best Free Open Data Sources Anyone Can Use](https://www.freecodecamp.org/news/https-medium-freecodecamp-org-best-free-open-data-sources-anyone-can-use-a65b514b0f2d/)
================================================
FILE: sections/08-InterviewQuestions.md
================================================
1001 Data Engineering Interview Questions
=========================================
Hey everyone, this collection of questions and answers is a work in progress.
I'm going to keep adding Q&As, but you are invited to collaborate through [GitHub](https://github.com/andkret/Cookbook):
- Either clone this repo, make your changes and create a pull request
- or raise an issue on GitHub with your questions and answers and we'll add them
Andreas
## Contents:
- [Python](08-InterviewQuestions.md#python)
- [SQL](08-InterviewQuestions.md#sql)
- [Integrate](08-InterviewQuestions.md#integrate)
- [APIs](08-InterviewQuestions.md#apis)
- [Message Queues](08-InterviewQuestions.md#message-queues)
- [Distributed Message Queues](08-InterviewQuestions.md#distributed-message-queues)
- [Message Queues (Fifo)](08-InterviewQuestions.md#message-queues-fifo)
- [Caches](08-InterviewQuestions.md#caches)
- [Data Processing](08-InterviewQuestions.md#data-processing)
- [ETL](08-InterviewQuestions.md#etl)
- [Stream Processing](08-InterviewQuestions.md#stream-processing)
- [Batch Processing](08-InterviewQuestions.md#batch-processing)
- [Processing Frameworks](08-InterviewQuestions.md#processing-frameworks)
- [Serverless](08-InterviewQuestions.md#serverless)
- [Distributed Processing Frameworks](08-InterviewQuestions.md#distributed-processing-frameworks)
- [Scheduling](08-InterviewQuestions.md#scheduling)
- [Airflow](08-InterviewQuestions.md#airflow)
- [CI-CD](08-InterviewQuestions.md#ci-cd)
- [Docker](08-InterviewQuestions.md#docker-and-kubernetes)
- [Kubernetes](08-InterviewQuestions.md#docker-and-kubernetes)
- [Data Storage](08-InterviewQuestions.md#data-storage)
- [Relational Databases](08-InterviewQuestions.md#relational-databases)
- [NoSQL](08-InterviewQuestions.md#nosql)
- [Analytical Stores](08-InterviewQuestions.md#analytical-stores)
- [Relational Modeling](08-InterviewQuestions.md#relational-modeling)
- [Dimensional Data Modeling](08-InterviewQuestions.md#dimensional-data-modeling)
- [Data Lakes](08-InterviewQuestions.md#data-lakes)
- [Data Platforms](08-InterviewQuestions.md#data-platforms)
- [AWS](08-InterviewQuestions.md#aws)
- [Azure](08-InterviewQuestions.md#azure)
- [GCP](08-InterviewQuestions.md#gcp)
- [Snowflake](08-InterviewQuestions.md#snowflake)
### Python
1. **What is Apache Spark, and how can you use it with Python?**
- **Answer**: Apache Spark is a distributed data processing framework that allows for big data processing with in-memory computing capabilities. You can use it with Python through PySpark, which provides a Python API for Spark. PySpark enables data engineers to write Spark applications in Python.
2. **How do you perform data cleaning in Python?**
- **Answer**: Data cleaning in Python can be performed using the `pandas` library. Common tasks include handling missing values (`dropna`, `fillna`), removing duplicates (`drop_duplicates`), converting data types, normalizing data, and handling outliers. Example:
```python
import pandas as pd
df = pd.read_csv('data.csv')
df.dropna(inplace=True) # Remove rows with missing values
df['column'] = df['column'].astype(int) # Convert column to integer type
```
3. **Explain how you would optimize a slow-running SQL query within a Python ETL pipeline.**
- **Answer**: To optimize a slow-running SQL query, you can:
- Analyze the query execution plan.
- Add appropriate indexes.
- Optimize the query by reducing complexity, such as using JOINs efficiently and avoiding unnecessary subqueries.
- Partition large tables if applicable.
- Use caching and materialized views for frequently accessed data.
- Ensure that statistics are up to date.
Example with SQLAlchemy:
```python
from sqlalchemy import create_engine
engine = create_engine('postgresql://user:password@localhost/dbname')
with engine.connect() as connection:
    result = connection.execute('SELECT * FROM table WHERE condition')
    data = result.fetchall()
```
4. **What is the role of a workflow scheduler in data engineering, and can you name some common ones?**
- **Answer**: A workflow scheduler automates and manages the execution of ETL jobs and data pipelines. It ensures tasks are executed in the correct order and handles retries, dependencies, and monitoring. Common workflow schedulers include Apache Airflow, Luigi, Prefect, and Apache NiFi.
5. **How do you handle schema changes in a data pipeline?**
- **Answer**: Handling schema changes in a data pipeline involves:
- Implementing schema evolution techniques.
- Using tools like Apache Avro, which supports schema evolution.
- Versioning schemas and ensuring backward compatibility.
- Monitoring and validating incoming data against the schema.
- Applying transformations to adapt to new schemas.
Example with Avro:
```python
from avro.datafile import DataFileReader
from avro.io import DatumReader
reader = DataFileReader(open("data.avro", "rb"), DatumReader())
for record in reader:
    print(record)
reader.close()
```
6. **What is data partitioning, and why is it important in data engineering?**
- **Answer**: Data partitioning is the process of dividing a large dataset into smaller, more manageable pieces, often based on a key such as date, user ID, or geographic location. Partitioning improves query performance by reducing the amount of data scanned and allows for parallel processing. It also helps in managing large datasets and reducing I/O costs.
7. **How do you ensure data quality in your pipelines?**
- **Answer**: Ensuring data quality involves:
- Implementing data validation checks (e.g., constraints, data type checks).
- Monitoring for data anomalies and inconsistencies.
- Using data profiling tools to understand the data.
- Creating unit tests for data processing logic.
- Automating data quality checks and alerting mechanisms.
Example with `pandas` for data validation:
```python
import pandas as pd
df = pd.read_csv('data.csv')
assert df['column'].notnull().all(), "Missing values found in column"
assert (df['age'] >= 0).all(), "Negative ages found"
```
8. **What is the difference between batch processing and stream processing?**
- **Answer**: Batch processing involves processing large volumes of data at once, usually at scheduled intervals. It is suitable for tasks that are not time-sensitive. Stream processing, on the other hand, involves processing data in real-time as it arrives, which is suitable for time-sensitive applications such as real-time analytics, monitoring, and alerts.
9. **How do you implement logging and monitoring in your data pipelines?**
- **Answer**: Logging and monitoring can be implemented using:
- Logging libraries like Python's `logging` module to capture and store logs.
- Monitoring tools like Prometheus, Grafana, or ELK Stack (Elasticsearch, Logstash, Kibana) to visualize and monitor logs.
- Setting up alerts for failures or anomalies.
Example with Python's `logging` module:
```python
import logging
logging.basicConfig(filename='pipeline.log', level=logging.INFO)
logging.info('This is an informational message')
logging.error('This is an error message')
```
10. **What are some common challenges you face with distributed data processing, and how do you address them?**
- **Answer**: Common challenges with distributed data processing include data consistency, fault tolerance, data shuffling, and latency. To address these:
- Use distributed processing frameworks like Apache Spark, which handle many of these issues internally.
- Implement robust error handling and retries.
- Optimize data shuffling by partitioning data effectively.
- Use caching mechanisms to reduce latency.
- Ensure proper resource allocation and scaling to handle large data volumes.
## SQL
## Integrate
### APIs
These questions cover a range of topics related to APIs, including their concepts, security, best practices, and specific implementation details.
1. **What is an API and how does it work?**
- **Answer**: An API (Application Programming Interface) is a set of rules and protocols for building and interacting with software applications. It allows different software systems to communicate with each other. APIs define the methods and data formats that applications can use to request and exchange data.
2. **What are the different types of APIs?**
- **Answer**: The main types of APIs include:
- **Open APIs (Public APIs)**: Available to developers and other users with minimal restrictions.
- **Internal APIs (Private APIs)**: Used within an organization to connect systems and data internally.
- **Partner APIs**: Shared with specific business partners and offer more control over how data is exposed.
- **Composite APIs**: Combine multiple API requests into a single call, allowing multiple data or service requests in one API call.
3. **What is REST and how does it differ from SOAP?**
- **Answer**: REST (Representational State Transfer) and SOAP (Simple Object Access Protocol) are two different approaches to building APIs. REST uses standard HTTP methods (GET, POST, PUT, DELETE) and is stateless, meaning each request from a client to a server must contain all the information needed to understand and process the request. SOAP, on the other hand, is a protocol that relies on XML-based messaging and includes built-in rules for security and transactions.
4. **Explain the concept of RESTful services.**
- **Answer**: RESTful services are web services that follow the principles of REST. These principles include:
- **Statelessness**: Each request from a client must contain all the information needed by the server to process the request.
- **Client-Server Architecture**: The client and server are separate entities, and they communicate over a network via standard HTTP.
- **Cacheability**: Responses from the server can be cached by the client or intermediate proxies to improve performance.
- **Uniform Interface**: Resources are identified in the request (usually via URIs), and actions are performed using standard HTTP methods.
5. **What is an API gateway and why is it used?**
- **Answer**: An API gateway is a server that acts as an intermediary for requests from clients seeking resources from backend services. It provides various functions such as request routing, composition, protocol translation, and handling of cross-cutting concerns like authentication, authorization, logging, monitoring, and rate limiting. It simplifies the client interface and improves security, scalability, and manageability of API services.
6. **How do you ensure the security of an API?**
- **Answer**: Ensuring API security involves several practices, including:
- **Authentication**: Verify the identity of the user or system making the request (e.g., using OAuth, JWT).
- **Authorization**: Ensure the authenticated user or system has permission to perform the requested action.
- **Encryption**: Use HTTPS to encrypt data in transit between the client and server.
- **Rate Limiting**: Prevent abuse by limiting the number of requests a client can make in a given time period.
- **Input Validation**: Validate and sanitize all inputs to prevent injection attacks.
- **Logging and Monitoring**: Track API usage and monitor for unusual or suspicious activity.
7. **What is versioning in APIs and how is it typically managed?**
- **Answer**: API versioning is the practice of managing changes to an API without disrupting existing clients. It can be managed in several ways, including:
- **URI Versioning**: Including the version number in the URI path (e.g., `/v1/resource`).
- **Query Parameter Versioning**: Including the version number as a query parameter (e.g., `/resource?version=1`).
- **Header Versioning**: Including the version number in the HTTP headers (e.g., `Accept: application/vnd.example.v1+json`).
8. **What are HTTP status codes and why are they important in API responses?**
- **Answer**: HTTP status codes are standardized codes returned by a server to indicate the result of a client's request. They are important because they provide meaningful feedback to the client about what happened with their request. Common status codes include:
- **200 OK**: The request was successful.
- **201 Created**: A resource was successfully created.
- **400 Bad Request**: The request was invalid or cannot be processed.
- **401 Unauthorized**: Authentication is required and has failed or has not yet been provided.
- **404 Not Found**: The requested resource could not be found.
- **500 Internal Server Error**: An error occurred on the server.
9. **Explain the concept of idempotency in RESTful APIs.**
- **Answer**: Idempotency refers to the property of certain operations whereby performing the same operation multiple times results in the same outcome. In RESTful APIs, methods like GET, PUT, and DELETE are idempotent because making the same request multiple times has the same effect as making it once. POST is not idempotent because multiple requests could create multiple resources.
10. **How do you handle pagination in APIs?**
- **Answer**: Pagination is used to split large sets of data into manageable chunks. Common methods for handling pagination include:
- **Offset and Limit**: Using query parameters to specify the starting point and number of records to return (e.g., `?offset=0&limit=10`).
- **Page Number and Size**: Using query parameters to specify the page number and the number of records per page (e.g., `?page=1&size=10`).
- **Cursor-Based Pagination**: Using a cursor (a pointer to a specific record) to fetch the next set of results (e.g., `?cursor=abc123`).
These additional questions cover more advanced topics related to APIs, including security, design principles, best practices, and tooling.
11. **What is the difference between synchronous and asynchronous API calls?**
- **Answer**: Synchronous API calls wait for the response before continuing, blocking the execution of code until the operation completes. Asynchronous API calls, on the other hand, do not block the execution; they allow the code to continue running and handle the response once it arrives, typically through callbacks, promises, or async/await patterns.
12. **What is a webhook, and how does it differ from an API endpoint?**
- **Answer**: A webhook is a way for an application to provide other applications with real-time information. A webhook is a "callback" that allows the sending application to push data to the receiving application when an event occurs. Unlike traditional API endpoints, which require the client to periodically check for data (polling), webhooks enable the server to push data to the client when an event occurs.
13. **What is CORS, and why is it important in the context of APIs?**
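- **Answer**: CORS (Cross-Origin Resource Sharing) is a browser-enforced mechanism that lets a server declare, via HTTP response headers such as `Access-Control-Allow-Origin`, which other origins are allowed to call it. By default, the browser's same-origin policy restricts web pages from making requests to a different origin (domain, scheme, or port) than the one that served the web page; CORS is how a server selectively relaxes that restriction. It is important in APIs to control how resources on a server are accessed by external domains. Proper CORS configuration ensures that only approved origins can access API resources from browser-based clients. It does not replace authentication, because non-browser clients are not bound by it.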
14. **What is the purpose of API documentation, and what should it include?**
- **Answer**: API documentation provides developers with the information they need to use and integrate with an API effectively. It should include:
- An overview of the API and its purpose.
- Authentication and authorization methods.
- Endpoint definitions and available methods (GET, POST, PUT, DELETE).
- Request and response formats (including headers, query parameters, and body data).
- Error codes and their meanings.
- Examples of requests and responses.
- Rate limits and usage policies.
15. **What are API gateways, and what role do they play in API management?**
- **Answer**: API gateways act as intermediaries between clients and backend services. They provide various functions such as request routing, load balancing, security (authentication and authorization), rate limiting, logging, monitoring, and transforming requests and responses. API gateways simplify client interactions with microservices and help manage and secure APIs.
16. **How do you handle authentication and authorization in APIs?**
- **Answer**: Authentication verifies the identity of a user or application, while authorization determines what resources and operations they have access to. Common methods for handling authentication and authorization in APIs include:
- API keys: Simple tokens provided to access the API.
- OAuth: An open standard for token-based authentication and authorization.
- JWT (JSON Web Tokens): A compact, URL-safe means of representing claims to be transferred between two parties.
- Basic Auth: A simple method using a username and password encoded in base64.
17. **What is the concept of rate limiting in APIs, and why is it important?**
- **Answer**: Rate limiting controls the number of requests a client can make to an API within a specified time period. It is important for:
- Preventing abuse and overuse of API resources.
- Ensuring fair usage among clients.
- Protecting the backend services from being overwhelmed.
- Managing and maintaining service quality and performance.
18. **Explain the concept of API throttling.**
- **Answer**: API throttling is the process of controlling the usage rate of an API by limiting the number of requests a client can make within a certain timeframe. Throttling helps prevent abuse, protects resources, and ensures that the service remains available and responsive to all users. It can be implemented using techniques such as rate limits, quotas, and burst control.
19. **What is HATEOAS and how does it relate to RESTful APIs?**
- **Answer**: HATEOAS (Hypermedia As The Engine Of Application State) is a constraint of RESTful APIs where hypermedia links are included in the responses to guide clients through the API. It allows clients to dynamically discover available actions and navigate the API without hardcoding the structure. For example, a response to a GET request for a user resource might include links to update or delete the user.
20. **What are some common tools and platforms for testing and documenting APIs?**
- **Answer**: Common tools and platforms for testing and documenting APIs include:
- **Postman**: A popular tool for developing, testing, and documenting APIs.
- **Swagger/OpenAPI**: A framework for designing, building, and documenting RESTful APIs, often used with tools like Swagger UI and Swagger Editor.
- **Insomnia**: An API client for testing RESTful and GraphQL APIs.
- **Apigee**: An API management platform providing tools for API design, security, analytics, and monitoring.
- **Paw**: A macOS-based API client for testing and documenting APIs.
- **RAML (RESTful API Modeling Language)**: A language for designing and documenting APIs.
## Message queues
### Distributed Message Queues
### Message Queues (Fifo)
### Caches
## Data Processing
### ETL
### Stream processing
### Batch processing
### Processing Frameworks
#### Serverless
#### Distributed Processing frameworks
### Scheduling
#### Airflow
### Docker and Kubernetes
### CI-CD
## Data Storage
### Relational Databases
### NoSQL
### Analytical Stores
### Relational Modeling
### Dimensional Data Modeling
### Data Lakes
## Data Platforms
### AWS
### GCP
### Azure
### Snowflake
Looking for a job or just want to know what people find important? In
this chapter you can find a lot of interview questions we collect on the
stream.
Ultimately this should reach at least one thousand and one questions.
**But Andreas, where are the answers??** Answers are for losers. I have
been thinking a lot about this and the best way for you to prepare and
learn is to look into these questions yourself.
This cookbook or Google will help you a long way. Some questions we
discuss directly on the live stream.
Live Streams
------------
First live stream where we started to collect these questions.
| Podcast Episode: #096 1001 Data Engineering Interview Questions
|------------------|
|First live stream where we collect and try to answer as many interview questions as possible. If this helps people and is fun we do this regularly until we reach 1000 and one.
| [Watch on YouTube](https://youtu.be/WbqRH2r3N40)
All Interview Questions
-----------------------
The interview questions are roughly structured like the sections in the
\"Basic data engineering skills\" part. This makes it easier to navigate
this document. I still need to sort them accordingly.
### SQL DBs
- What are windowing functions?
- What is a stored procedure?
- Why would you use them?
- What are atomic attributes?
- Explain ACID props of a database
- How to optimize queries?
- What are the different types of JOIN (CROSS, INNER, OUTER)?
- What is the difference between Clustered Index and Non-Clustered
Index - with examples?
### The Cloud
- What is serverless?
- What is the difference between IaaS, PaaS and SaaS?
- How do you move from the ingest layer to the Consumption layer? (In
Serverless)
- What is edge computing?
- What is the difference between cloud and edge and on-premise?
### Linux
- What is crontab?
### Big Data
- What are the 4 V's?
- Which one is most important?
### Kafka
- What is a topic?
- How to ensure FIFO?
- How do you know if all messages in a topic have been fully consumed?
- What are brokers?
- What are consumer groups?
- What is a producer?
### Coding
- What is the difference between an object and a class?
- Explain immutability
- What are AWS Lambda functions and why would you use them?
- Difference between library, framework and package
- How to reverse a linked list
- Difference between args and kwargs
- Difference between OOP and functional programming
### NoSQL DBs
- What is a key-value (rowstore) store?
- What is a columnstore?
- Difference between a row store and a column store
- What is a document store?
- Difference between Redshift and Snowflake
### Hadoop
- What file formats can you use in Hadoop?
- What is the difference between a namenode and a datanode?
- What is HDFS?
- What is the purpose of YARN?
### Lambda Architecture
- What is streaming and batching?
- What is the upside of streaming vs batching?
- What is the difference between lambda and kappa architecture?
- Can you sync the batch and streaming layer and if yes how?
### Data Warehouse & Data Lake
- What is a data lake?
- What is a data warehouse?
- Are there data lake warehouses?
- Two data lakes within single warehouse?
- What is a data mart?
- What is a slowly changing dimension (types)?
- What is a surrogate key and why use them?
### APIs (REST)
- What does REST mean?
- What is idempotency?
- What are common REST API frameworks (Jersey and Spring)?
### Apache Spark
- What is an RDD?
- What is a dataframe?
- What is a dataset?
- How is a dataset typesafe?
- What is Parquet?
- What is Avro?
- Difference between Parquet and Avro
- Tumbling Windows vs. Sliding Windows
- Difference between batch and stream processing
- What are microbatches?
### MapReduce
- What is a use case of mapreduce?
- Write a pseudo code for wordcount
- What is a combiner?
### Docker & Kubernetes
- What is a container?
- Difference between Docker Container and a Virtual PC
- What is the easiest way to learn kubernetes fast?
### Data Pipelines
- What is an example of a serverless pipeline?
- What is the difference between at most once vs at least once vs
exactly once?
- What systems provide transactions?
- What is an ETL pipeline?
### Airflow
- What is a DAG (in context of airflow/luigi)?
- What are hooks/is a hook?
- What are operators?
- How to branch?
### DataVisualization
- What is a BI tool?
### Security/Privacy
- What is Kerberos?
- What is a firewall?
- What is GDPR?
- What is anonymization?
### Distributed Systems
- How do clusters reach consensus? (The answer was using consensus
protocols like Paxos or Raft. Good thing I didn't have to explain Paxos.)
- What is the CAP theorem / explain it (What factors should be
considered when choosing a DB?)
- How to choose right storage for different data consumers? It's
always a tricky question
### Apache Flink
- What is Flink used for?
- Flink vs Spark?
### GitHub
- What are branches?
- What are commits?
- What's a pull request?
### Dev/Ops
- What is continuous integration?
- What is continuous deployment?
- Difference CI/CD
### Development / Agile
- What is Scrum?
- What is OKR?
- What is Jira and what is it used for?
================================================
FILE: sections/09-BooksAndCourses.md
================================================
Recommended Books, Courses, and Podcasts
=============================
## Contents
- [About Books, Courses, and Podcasts](09-BooksAndCourses.md#about-books-courses-and-podcasts)
- [Books](09-BooksAndCourses.md#books)
- [Languages](09-BooksAndCourses.md#languages)
- [Data Science Tools](09-BooksAndCourses.md#data-science-tools)
- [Business](09-BooksAndCourses.md#business)
- [Community Recommendations](09-BooksAndCourses.md#Community-Recommendations)
- [Online Courses](09-BooksAndCourses.md#Online-Courses)
- [Preparation courses](09-BooksAndCourses.md#Preparation-courses)
- [Data engineering courses](09-BooksAndCourses.md#Data-engineering-courses)
- [Certifications](09-BooksAndCourses.md#Certifications)
- [Podcasts](09-BooksAndCourses.md#Podcasts)
## About Books, Courses, and Podcasts
This is a collection of books and courses I can recommend personally.
They are great for every data engineering learner.
I have either used or owned these books during my professional work.
I also looked into every online course personally.
If you want to buy a book or course and support my work, please use one of my links below. They are all affiliate marketing links that help me fund this passion.
Of course all this comes at no additional expense to you, but it helps me a lot.
You can find even more interesting books and my whole podcast equipment on my Amazon store:
[Go to the Amazon store](https://www.amazon.com/shop/plumbersofdatascience)
PS: Don't just get a book and expect to learn everything
- Course certificates alone won't help you
- Have a purpose in mind, like a small project
- Great for use at work
## Books
### Languages
#### Java
[Learning Java: A Bestselling Hands-On Java Tutorial](https://amzn.to/2MgYp8h)
#### Python
[Learning Python, 5th Edition](https://amzn.to/2MdpM34)
#### Scala
[Programming Scala: Scalability = Functional Programming + Objects](https://amzn.to/2VIpww5)
#### Swift
[Learning Swift: Building Apps for macOS, iOS, and Beyond](https://amzn.to/31hDN4e)
### Data Science Tools
#### Apache Spark
[Learning Spark: Lightning-Fast Big Data Analysis](https://amzn.to/31mtAUg)
#### Apache Kafka
[Kafka Streams in Action: Real-time apps and microservices with the Kafka Streams API](https://amzn.to/35uiSOJ)
#### Apache Hadoop
[Hadoop: The Definitive Guide: Storage and Analysis at Internet Scale](https://amzn.to/2VNzf4n)
#### Apache HBase
[HBase: The Definitive Guide: Random Access to Your Planet-Size Data](https://amzn.to/2BbiyGz)
### Business
#### The Lean Startup
[The Lean Startup: How Today's Entrepreneurs Use Continuous Innovation to Create Radically Successful Businesses](https://amzn.to/2Meyv5e)
#### Zero to One
[Zero to One: Notes on Startups, or How to Build the Future](https://amzn.to/2BbBwgr)
#### The Innovator's Dilemma
[The Innovator's Dilemma: When New Technologies Cause Great Firms to Fail (Management of Innovation and Change)](https://amzn.to/31eGZ0k)
#### Crossing the Chasm
[Crossing the Chasm, 3rd Edition (Collins Business Essentials)](https://amzn.to/2IU7QZs)
#### Crush It!
[Crush It!: Why Now Is The Time To Cash In On Your Passion](https://amzn.to/33xe7Su)
### Community Recommendations
#### Designing Data-Intensive Applications
"In my opinion, the knowledge contained in this book differentiates a data engineer from a software engineer or a developer. The book strikes a good balance between breadth and depth of discussion on data engineering topics, as well as the tradeoffs we must make due to working with massive amounts of data." -- David Lee on LinkedIn
[Designing Data-Intensive Applications: The Big Ideas Behind Reliable, Scalable, and Maintainable Systems](https://amzn.to/2MIqTqJ)
## Online Courses
### Preparation courses
| Course name | Course description | Course URL |
|---|---|---|
| The Bits and Bytes of Computer Networking | This course is designed to provide a full overview of computer networking. We’ll cover everything from the fundamentals of modern networking technologies and protocols to an overview of the cloud to practical applications and network troubleshooting. | https://www.coursera.org/learn/computer-networking |
| Learn SQL \| Codecademy | In this SQL course, you'll learn how to manage large datasets and analyze real data using the standard data management language. | https://www.codecademy.com/learn/learn-sql |
| Learn Python 3 \| Codecademy | Learn the basics of Python 3, one of the most powerful, versatile, and in-demand programming languages today. | https://www.codecademy.com/learn/learn-python-3 |
### Data engineering courses
| Course name | Course description | Course URL |
|---|---|---|
| **1. Data Engineering Basics** | | |
| Introduction to Data Engineering | Introduction to Data Engineering with over 1 hour of videos including my journey here. | https://learndataengineering.com/p/introduction-to-data-engineering |
| Computer Science Fundamentals | A complete guide of topics and resources you should know as a Data Engineer. | https://learndataengineering.com/p/data-engineering-fundamentals |
| Introduction to Python | Learn all the fundamentals of Python to start coding quickly | https://learndataengineering.com/p/introduction-to-python |
| Python for Data Engineers | Learn all the Python topics a Data Engineer needs even if you don't have a coding background | https://learndataengineering.com/p/python-for-data-engineers |
| Docker Fundamentals | Learn all the fundamental Docker concepts with hands-on examples | https://learndataengineering.com/p/docker-fundamentals |
| Successful Job Application | Everything you need to get your dream job in Data Engineering. | https://learndataengineering.com/p/successful-job-application |
| Data Preparation & Cleaning for ML | All you need for preparing data to enable Machine Learning. | https://learndataengineering.com/p/data-preparation-and-cleaning-for-ml |
| **2. Platform & Pipeline Design Fundamentals** | | |
| Data Platform And Pipeline Design | Learn how to build data pipelines with templates and examples for Azure, GCP and Hadoop. | https://learndataengineering.com/p/data-pipeline-design |
| Platform & Pipelines Security | Learn the important security fundamentals for Data Engineering | https://learndataengineering.com/p/platform-pipeline-security |
| Choosing Data Stores | Learn the different types of data stores and when to use which. | https://learndataengineering.com/p/choosing-data-stores |
| Schema Design Data Stores | Learn how to design schemas for SQL, NoSQL and Data Warehouses. | https://learndataengineering.com/p/data-modeling |
| **3. Fundamental Tools** | | |
| Building APIs with FastAPI | Learn the fundamentals of designing, creating and deploying APIs with FastAPI and Docker | https://learndataengineering.com/p/apis-with-fastapi-course |
| Apache Kafka Fundamentals | Learn the fundamentals of Apache Kafka | https://learndataengineering.com/p/apache-kafka-fundamentals |
| Apache Spark Fundamentals | Apache Spark quick start course in Python with Jupyter notebooks, DataFrames, SparkSQL and RDDs. | https://learndataengineering.com/p/learning-apache-spark-fundamentals |
| Data Engineering on Databricks | Everything you need to get started with Databricks. From setup to building ETL pipelines & warehousing. | https://learndataengineering.com/p/data-engineering-on-databricks |
| MongoDB Fundamentals | Learn how to use MongoDB | https://learndataengineering.com/p/mongodb-fundamentals-course |
| Log Analysis with Elasticsearch | Learn how to monitor and debug your data pipelines | https://learndataengineering.com/p/log-analysis-with-elasticsearch |
| Airflow Workflow Orchestration | Learn how to orchestrate your data pipelines with Apache Airflow | https://learndataengineering.com/p/learn-apache-airflow |
| Snowflake for Data Engineers | Everything you need to get started with Snowflake | https://learndataengineering.com/p/snowflake-for-data-engineers |
| dbt for Data Engineers | Everything you need to work with dbt and Snowflake | https://learndataengineering.com/p/dbt-for-data-engineers |
| **4. Full Hands-On Example Projects** | | |
| Data Engineering on AWS | Full 5 hours course with complete example project. Building stream and batch processing pipelines on AWS. | https://learndataengineering.com/p/data-engineering-on-aws |
| Data Engineering on Azure | Ingest, Store, Process, Serve and Visualize Streams of Data by Building Streaming Data Pipelines in Azure. | https://learndataengineering.com/p/build-streaming-data-pipelines-in-azure |
| Data Engineering on GCP | Everything you need to start with Google Cloud. | https://learndataengineering.com/p/data-engineering-on-gcp |
| Modern Data Warehouses & Data Lakes | How to integrate a Data Lake with a Data Warehouse and query data directly from files | https://learndataengineering.com/p/modern-data-warehouses |
| Machine Learning & Containerization On AWS | Build an app that analyzes the sentiment of tweets and visualizes them on a user interface hosted as a container | https://learndataengineering.com/p/ml-on-aws |
| Contact Tracing with Elasticsearch | Track 100,000 users in San Francisco using Elasticsearch and an interactive Streamlit user interface | https://learndataengineering.com/p/contact-tracing-with-elasticsearch |
| Document Streaming Project | Document Streaming with FastAPI, Kafka, Spark Streaming, MongoDB and Streamlit | https://learndataengineering.com/p/document-streaming |
| Storing & Visualizing Time Series Data with InfluxDB and Grafana | Learn how to use InfluxDB to store time series data and visualize interactive dashboards with Grafana | https://learndataengineering.com/p/time-series-influxdb-grafana |
| Data Engineering with Hadoop | Hadoop Project with HDFS, YARN, MapReduce, Hive and Sqoop! | https://learndataengineering.com/p/data-engineering-with-hadoop |
| Dockerized ETL | Learn how to quickly set up a simple ETL script with AWS, TDengine & Grafana | https://learndataengineering.com/p/timeseries-etl-with-aws-tdengine-grafana |
## Certifications
Here's a list of great certifications that you can do on AWS and Azure. We left out GCP here because the adoption of AWS and Azure is a lot higher, which is why I recommend starting with one of these. The prices listed are usually the fees for taking the certification exams. We also added the level and prerequisites to make it easier for you to decide which one fits you.
| Platform | Certification Name | Price | Level | Prerequisite Experience | URL |
|----------|---------------------------------------------------------|-------|-------------|------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------|
| AWS | AWS Certified Cloud Practitioner (maybe) | 100 | Beginner | Familiarity with the AWS platform is recommended but not required. | [Link](https://aws.amazon.com/certification/certified-cloud-practitioner/) |
| AWS | AWS Certified Solutions Architect - Professional | 300 | Expert | AWS Certified Solutions Architect - Professional is intended for individuals with two or more years of hands-on experience designing and deploying cloud architecture on AWS. | [Link](https://aws.amazon.com/certification/certified-solutions-architect-professional/?ch=sec&sec=rmg&d=1) |
| AWS | AWS Certified Solutions Architect - Associate | 150 | Intermediate| This is an ideal starting point for candidates with AWS Cloud or strong on-premises IT experience. This exam does not require deep hands-on coding experience, although familiarity with basic programming concepts would be an advantage. | [Link](https://aws.amazon.com/certification/certified-solutions-architect-associate/) |
| AWS | AWS Certified Data Engineer | 150 | Intermediate| The ideal candidate for this exam has the equivalent of 2-3 years of experience in data engineering or data architecture and a minimum of 1-2 years of hands-on experience with AWS services. | [Link](https://aws.amazon.com/certification/certified-data-engineer-associate/) |
| Azure | Microsoft Certified: Azure Cosmos DB Developer Specialty| 165 | Intermediate| | [Link](https://learn.microsoft.com/en-us/credentials/certifications/azure-cosmos-db-developer-specialty/?practice-assessment-type=certification) |
| Azure | Microsoft Certified: Azure Data Engineer Associate - DP 203| 165 | Intermediate| | [Link](https://learn.microsoft.com/en-us/credentials/certifications/azure-data-engineer/?practice-assessment-type=certification) |
| Azure | Microsoft Certified: Azure Data Fundamentals | 99 | Beginner | | [Link](https://learn.microsoft.com/en-us/credentials/certifications/azure-data-fundamentals/?practice-assessment-type=certification) |
| Azure | Microsoft Certified: Azure Database Administrator Associate| 165 | Intermediate| | [Link](https://learn.microsoft.com/en-us/credentials/certifications/azure-database-administrator-associate/?practice-assessment-type=certification) |
| Azure | Microsoft Certified: Azure Developer Associate | 165 | Intermediate| | [Link](https://learn.microsoft.com/en-us/credentials/certifications/azure-developer/?practice-assessment-type=certification) |
| Azure | Microsoft Certified: Azure Fundamentals | 99 | Beginner | | [Link](https://learn.microsoft.com/en-us/credentials/certifications/azure-fundamentals/?practice-assessment-type=certification) |
| Azure | Microsoft Certified: Azure Solutions Architect Expert | 165 | Expert | Microsoft Certified: Azure Administrator Associate certification | [Link](https://learn.microsoft.com/en-us/credentials/certifications/azure-solutions-architect/) |
| Azure | Microsoft Certified: Fabric Analytics Engineer Associate| 165 | Intermediate| | [Link](https://learn.microsoft.com/en-us/credentials/certifications/fabric-analytics-engineer-associate/?practice-assessment-type=certification) |
| Azure | Microsoft Certified: Fabric Data Engineer Associate | 165 | Intermediate| | [Link](https://learn.microsoft.com/en-us/credentials/certifications/fabric-data-engineer-associate/) |
| Azure | Microsoft Certified: Power BI Data Analyst Associate | 165 | Intermediate| | [Link](https://learn.microsoft.com/en-us/credentials/certifications/data-analyst-associate/?practice-assessment-type=certification) |
## Podcasts
Top five podcasts by the number of episodes created.
### Super Data Science
[The latest machine learning, A.I., and data career topics from across both academia and industry are brought to you by host Dr. Jon Krohn on the Super Data Science Podcast.](https://podcasts.apple.com/us/podcast/super-data-science/id1163599059)
### Data Skeptic
[The Data Skeptic Podcast features interviews and discussion of topics related to data science, statistics, machine learning, artificial intelligence and the like, all from the perspective of applying critical thinking and the scientific method to evaluate the veracity of claims and efficacy of approaches.](https://podcasts.apple.com/us/podcast/data-skeptic/id890348705)
### Data Engineering Podcast
[This show goes behind the scenes for the tools, techniques, and difficulties associated with the discipline of data engineering. Databases, workflows, automation, and data manipulation are just some of the topics that you will find here.](https://podcasts.apple.com/us/podcast/data-engineering-podcast/id1193040557?mt=2)
### Roaring Elephant BiteSized Big Tech
[A weekly community podcast about Big Technology with a focus on Open Source, Advanced Analytics and other modern magic.](https://roaringelephant.org/)
### SQL Data Partners Podcast
[Hosted by Carlos L Chacon, the SQL Data Partners Podcast focuses on Microsoft data platform related topics mixed with a sprinkling of professional development. Carlos and guests discuss new and familiar features and ideas and how you might apply them in your environments.](https://podcasts.apple.com/us/podcast/sql-data-partners-podcast/id1027394388)
### Complete list
| Host name | Podcast name | Access podcast |
|-------------------------|----------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------|
| Jon Krohn | Super Data Science | https://www.superdatascience.com/podcast |
| Kyle Polich | Data Skeptic | https://dataskeptic.com/ |
| Tobias Macey | Data Engineering Podcast | https://www.dataengineeringpodcast.com/ |
| Dave Russell | Roaring Elephant - Bite-Sized Big Tech | https://roaringelephant.org/ |
| Carlos L Chacon | SQL Data Partners Podcast | https://sqldatapartners.com/podcast/ |
| Jason Himmelstein | BIFocal - Clarifying Business Intelligence | https://bifocal.show/ |
| Scott Hirleman | Data Mesh Radio | https://daappod.com/data-mesh-radio/ |
| Jonathan Schwabish | PolicyViz | https://policyviz.com/podcast/ |
| Al Martin | Making Data Simple | https://www.ibm.com/blogs/journey-to-ai/2021/02/making-data-simple-this-week-we-continue-our-discussion-on-data-framework-and-what-is-meant-by-data-framework/ |
| John David Ariansen | How to Get an Analytics Job | https://www.silvertoneanalytics.com/how-to-get-an-analytics-job/ |
| Moritz Stefaner | Data Stories | https://datastori.es/ |
| Hilary Parker | Not So Standard Deviations | https://nssdeviations.com/ |
| Ben Lorica | The Data Exchange with Ben Lorica | https://thedataexchange.media/author/bglorica/ |
| Juan Sequeda | Catalog & Cocktails | https://data.world/resources/podcasts/ |
| Wayne Eckerson | Secrets of Data Analytics Leaders | https://www.eckerson.com/podcasts/secrets-of-data-analytics-leaders |
| Guy Glantser | SQL Server Radio | https://www.sqlserverradio.com/ |
| Eitan Blumin | SQL Server Radio | https://www.sqlserverradio.com/ |
| Jason Tan | The Analytics Show | https://ddalabs.ai/the-analytics-show/ |
| Hugo Bowne-Anderson | DataFramed | https://www.datacamp.com/podcast |
| Kostas Pardalis | The Data Stack Show | https://datastackshow.com/ |
| Eric Dodds | The Data Stack Show | https://datastackshow.com/ |
| Catherine King | The Business of Data Podcast | https://podcasts.apple.com/gb/podcast/the-business-of-data-podcast/id1528796448 |
| | The Business of Data | https://business-of-data.com/podcasts/ |
| James Le | Datacast | https://datacast.simplecast.com/ |
| Mike Delgado | DataTalk | https://podcasts.apple.com/us/podcast/datatalk/id1398548129 |
| Matt Housley | Monday Morning Data Chat | https://podcasts.apple.com/us/podcast/monday-morning-data-chat/id1565154727 |
| Francesco Gadaleta | Data Science at Home | https://datascienceathome.com/ |
| Alli Torban | Data Viz Today | https://dataviztoday.com/ |
| Steve Jones | Voice of the DBA | https://voiceofthedba.com/ |
| Lea Pica | The Present Beyond Measure Show: Data Storytelling, Presentation & Visualization | https://leapica.com/podcast/ |
| Samir Sharma | The Data Strategy Show | https://podcasts.apple.com/us/podcast/the-data-strategy-show/id1515194422 |
| Cindi Howson | The Data Chief | https://www.thoughtspot.com/data-chief/podcast |
| Cole Nussbaumer Knaflic | storytelling with data podcast | https://storytellingwithdata.libsyn.com/ |
| Margot Gerritsen | Women in Data Science | https://www.widsconference.org/podcast.html |
| Jonas Christensen | Leaders of Analytics | https://www.leadersofanalytics.com/episode/the-future-of-analytics-leadership-with-john-thompson |
| Matt Brady | ZUMA: Data For Good | https://www.youtube.com/@zuma-dataforgood |
| Julia Schottenstein | The Analytics Engineering Podcast | https://roundup.getdbt.com/s/the-analytics-engineering-podcast |
| | Data Unlocked | https://dataunlocked.buzzsprout.com/ |
| Boris Jabes | The Sequel Show | https://www.thesequelshow.com/ |
| | Data Radicals | https://www.alation.com/podcast/ |
| Nicola Askham | The Data Governance | https://www.nicolaaskham.com/podcast |
| Boaz Farkash | The Data Engineering Show | https://www.dataengineeringshow.com/ |
| Bob Haffner | The Engineering Side of Data | https://podcasts.apple.com/us/podcast/the-engineering-side-of-data/id1566999533 |
| Dan Linstedt | Data Vault Alliance | https://datavaultalliance.com/category/news/podcasts/ |
| Dustin Schimek | Data Ideas | https://podcasts.apple.com/us/podcast/data-ideas/id1650322207 |
| Alex Merced | The datanation | https://podcasts.apple.com/be/podcast/the-datanation-podcast-podcast-for-data-engineers/id1608638822 |
| Thomas Bustos | Let's Talk AI | https://www.youtube.com/@lets-talk-ai |
| Jahanvee Narang | Decoding Data Analytics | https://www.youtube.com/@decodingdataanalytics/videos |
================================================
FILE: sections/10-Updates.md
================================================
Updates
============
What's new? Here you can find a list of all the updates with links to the sections
- **2025-07-21**
- Added a list of my students' favorite datasets and APIs [click here](07-DataSources.md#Student-Favorites)
- **2025-06-11**
- Released the first playable demo of the Spark Optimization Playground [click here](https://bit.ly/play-spark-optimization)
- **2025-03-25**
- Added a detailed 14-week roadmap to Data Engineering for Data Scientists [click here](01-Introduction.md#roadmap-for-data-scientists)
- **2025-03-05**
- Added a detailed 11-week roadmap to Data Engineering for Beginners [click here](01-Introduction.md#roadmap-for-beginners)
- **2025-03-04**
- Added a detailed 10-week roadmap to Data Engineering for Data Analysts [click here](01-Introduction.md#roadmap-for-data-analysts)
- **2024-12-11**
- Prepared the 81 most important questions for platform & pipeline design. Specifically looking at the data source and the goals [click here](03-AdvancedSkills.md#81-platform-and-pipeline-design-questions)
- **2024-11-28**
- Prepared a GenAI RAG example project that you can run on your own computer without internet. It uses Ollama with the Mistral model and Elasticsearch. Working on a way of creating embeddings from PDF files and inserting them into Elasticsearch for queries [click here](04-HandsOnCourse.md#genai-retrieval-augmented-generation-with-ollama-and-elasticsearch)
- **2024-11-23**
- Added an overview of AWS and Azure cloud certifications for Data Engineers. From beginners to experts [click here](09-BooksAndCourses.md#Certifications)
- **2024-07-31**
- Added 10 platform architecture react videos I did to the "Best Practices" section. This way you get a better feeling of what companies are doing and which tools they use [click here](06-BestPracticesCloud.md#best-practices)
- **2024-07-17**
- Added 20 API interview questions and their answers [click here](08-InterviewQuestions.md#apis)
- Added 10 Python interview questions and their answers [click here](03-AdvancedSkills.md#python)
- **2024-07-08**
- Added large article about Snowflake and dbt for Data Engineers [click here](03-AdvancedSkills.md#analytical-data-stores)
- Added new section "Analytical Data Stores" to Advanced skills with the Snowflake & dbt info.
- Put SQL and NoSQL datastores into a new section "Transactional Data Stores"
- **2024-03-20**
- Added roadmap for Software Engineers / Computer Scientists [click here](01-Introduction.md#roadmap-for-software-engineers)
- Added many questions and answers from my interview on the Super Data Science Podcast (plus links to YouTube and the Podcast) [click here](01-Introduction.md#Interview-with-Andreas-on-the-Super-Data-Science-Podcast)
- **2024-03-13**
- Added "How to become a Senior Data Engineer" live stream series as a blog post with images shown in the live streams and the links to the videos. [click here](01-Introduction.md#how-to-become-a-senior-data-engineer)
- **2024-03-08**
- Included Data Engineering skills matrix into the introduction with link to the live stream. [click here](01-Introduction.md#data-engineers-skills-matrix)
- **2024-03-01**
- Added updates section
- Reworked the Hands-on courses section with 5 free courses / tutorials from Andreas on YouTube [click here](04-HandsOnCourse.md)
- **2024-02-28**
- Added Data Engineering Roadmap for Data Scientists: [click here](01-Introduction.md#roadmap-for-data-scientists)
- **2024-02-25**
- Data Engineering Roadmap for Software Engineers: [click here](01-Introduction.md#roadmap-for-software-engineers)
- **2024-02-20**
- Data Engineering Roadmap for Data Analysts: [click here](01-Introduction.md#roadmap-for-data-analysts)