newBuilder().
*/
public RideRecord() {
  // Intentionally empty: no field is assigned here.
}
/**
 * Constructs a record with every field supplied by the caller.
 *
 * @param vendor_id value stored in the 'vendor_id' field
 * @param passenger_count value stored in the 'passenger_count' field
 * @param trip_distance value stored in the 'trip_distance' field
 */
public RideRecord(
    java.lang.String vendor_id,
    java.lang.Integer passenger_count,
    java.lang.Double trip_distance) {
  // Parameters shadow the fields, so each assignment is qualified with 'this'.
  this.vendor_id = vendor_id;
  this.passenger_count = passenger_count;
  this.trip_distance = trip_distance;
}
/**
 * Returns the MODEL$ value of this class.
 *
 * @return MODEL$
 */
@Override
public org.apache.avro.specific.SpecificData getSpecificData() {
  return MODEL$;
}
/**
 * Returns the SCHEMA$ value of this class.
 *
 * @return SCHEMA$
 */
@Override
public org.apache.avro.Schema getSchema() {
  return SCHEMA$;
}
/**
 * Positional field read. Used by DatumWriter. Applications should not call.
 *
 * @param field$ field position: 0 = vendor_id, 1 = passenger_count, 2 = trip_distance
 * @return the value of the field at that position
 * @throws IndexOutOfBoundsException if {@code field$} is not 0, 1 or 2
 */
@Override
public java.lang.Object get(int field$) {
  if (field$ == 0) {
    return vendor_id;
  }
  if (field$ == 1) {
    return passenger_count;
  }
  if (field$ == 2) {
    return trip_distance;
  }
  throw new IndexOutOfBoundsException("Invalid index: " + field$);
}
/**
 * Positional field write. Used by DatumReader. Applications should not call.
 *
 * @param field$ field position: 0 = vendor_id, 1 = passenger_count, 2 = trip_distance
 * @param value$ the value to store; position 0 stores {@code value$.toString()} (or null),
 *     positions 1 and 2 cast to Integer and Double respectively
 * @throws IndexOutOfBoundsException if {@code field$} is not 0, 1 or 2
 */
@Override
@SuppressWarnings("unchecked")
public void put(int field$, java.lang.Object value$) {
  if (field$ == 0) {
    vendor_id = (value$ == null) ? null : value$.toString();
  } else if (field$ == 1) {
    passenger_count = (java.lang.Integer) value$;
  } else if (field$ == 2) {
    trip_distance = (java.lang.Double) value$;
  } else {
    throw new IndexOutOfBoundsException("Invalid index: " + field$);
  }
}
/**
 * Reads the 'vendor_id' field.
 *
 * @return the current value of 'vendor_id'
 */
public java.lang.String getVendorId() {
  return this.vendor_id;
}
/**
 * Overwrites the 'vendor_id' field.
 *
 * @param value the new value for 'vendor_id'
 */
public void setVendorId(java.lang.String value) {
  vendor_id = value;
}
/**
 * Reads the 'passenger_count' field.
 *
 * @return the current value of 'passenger_count'
 */
public int getPassengerCount() {
  return this.passenger_count;
}
/**
 * Overwrites the 'passenger_count' field.
 *
 * @param value the new value for 'passenger_count'
 */
public void setPassengerCount(int value) {
  passenger_count = value;
}
/**
 * Reads the 'trip_distance' field.
 *
 * @return the current value of 'trip_distance'
 */
public double getTripDistance() {
  return this.trip_distance;
}
/**
 * Overwrites the 'trip_distance' field.
 *
 * @param value the new value for 'trip_distance'
 */
public void setTripDistance(double value) {
  trip_distance = value;
}
/**
 * Factory for an empty RideRecord builder.
 *
 * @return a freshly constructed RideRecord Builder
 */
public static schemaregistry.RideRecord.Builder newBuilder() {
  schemaregistry.RideRecord.Builder fresh = new schemaregistry.RideRecord.Builder();
  return fresh;
}
/**
 * Factory for a RideRecord builder seeded from another builder.
 *
 * @param other the builder to copy; when null, a builder is created with the no-arg constructor
 * @return a new RideRecord Builder
 */
public static schemaregistry.RideRecord.Builder newBuilder(schemaregistry.RideRecord.Builder other) {
  // Guard clause: nothing to copy from.
  if (other == null) {
    return new schemaregistry.RideRecord.Builder();
  }
  return new schemaregistry.RideRecord.Builder(other);
}
/**
 * Factory for a RideRecord builder seeded from an existing RideRecord instance.
 *
 * @param other the instance to copy; when null, a builder is created with the no-arg constructor
 * @return a new RideRecord Builder
 */
public static schemaregistry.RideRecord.Builder newBuilder(schemaregistry.RideRecord other) {
  return (other == null)
      ? new schemaregistry.RideRecord.Builder()
      : new schemaregistry.RideRecord.Builder(other);
}
/**
* RecordBuilder for RideRecord instances.
*/
@org.apache.avro.specific.AvroGenerated
public static class Builder extends org.apache.avro.specific.SpecificRecordBuilderBasenewBuilder().
*/
public RideRecordCompatible() {
  // Intentionally empty: no field is assigned here.
}
/**
 * Constructs a record with every field supplied by the caller.
 *
 * @param vendorId value stored in the 'vendorId' field
 * @param passenger_count value stored in the 'passenger_count' field
 * @param trip_distance value stored in the 'trip_distance' field
 * @param pu_location_id value stored in the 'pu_location_id' field
 */
public RideRecordCompatible(
    java.lang.String vendorId,
    java.lang.Integer passenger_count,
    java.lang.Double trip_distance,
    java.lang.Long pu_location_id) {
  // Parameters shadow the fields, so each assignment is qualified with 'this'.
  this.vendorId = vendorId;
  this.passenger_count = passenger_count;
  this.trip_distance = trip_distance;
  this.pu_location_id = pu_location_id;
}
/**
 * Returns the MODEL$ value of this class.
 *
 * @return MODEL$
 */
@Override
public org.apache.avro.specific.SpecificData getSpecificData() {
  return MODEL$;
}
/**
 * Returns the SCHEMA$ value of this class.
 *
 * @return SCHEMA$
 */
@Override
public org.apache.avro.Schema getSchema() {
  return SCHEMA$;
}
/**
 * Positional field read. Used by DatumWriter. Applications should not call.
 *
 * @param field$ field position: 0 = vendorId, 1 = passenger_count, 2 = trip_distance,
 *     3 = pu_location_id
 * @return the value of the field at that position
 * @throws IndexOutOfBoundsException if {@code field$} is outside 0..3
 */
@Override
public java.lang.Object get(int field$) {
  if (field$ == 0) {
    return vendorId;
  }
  if (field$ == 1) {
    return passenger_count;
  }
  if (field$ == 2) {
    return trip_distance;
  }
  if (field$ == 3) {
    return pu_location_id;
  }
  throw new IndexOutOfBoundsException("Invalid index: " + field$);
}
/**
 * Positional field write. Used by DatumReader. Applications should not call.
 *
 * @param field$ field position: 0 = vendorId, 1 = passenger_count, 2 = trip_distance,
 *     3 = pu_location_id
 * @param value$ the value to store; position 0 stores {@code value$.toString()} (or null),
 *     positions 1, 2 and 3 cast to Integer, Double and Long respectively
 * @throws IndexOutOfBoundsException if {@code field$} is outside 0..3
 */
@Override
@SuppressWarnings("unchecked")
public void put(int field$, java.lang.Object value$) {
  if (field$ == 0) {
    vendorId = (value$ == null) ? null : value$.toString();
  } else if (field$ == 1) {
    passenger_count = (java.lang.Integer) value$;
  } else if (field$ == 2) {
    trip_distance = (java.lang.Double) value$;
  } else if (field$ == 3) {
    pu_location_id = (java.lang.Long) value$;
  } else {
    throw new IndexOutOfBoundsException("Invalid index: " + field$);
  }
}
/**
 * Reads the 'vendorId' field.
 *
 * @return the current value of 'vendorId'
 */
public java.lang.String getVendorId() {
  return this.vendorId;
}
/**
 * Overwrites the 'vendorId' field.
 *
 * @param value the new value for 'vendorId'
 */
public void setVendorId(java.lang.String value) {
  vendorId = value;
}
/**
 * Reads the 'passenger_count' field.
 *
 * @return the current value of 'passenger_count'
 */
public int getPassengerCount() {
  return this.passenger_count;
}
/**
 * Overwrites the 'passenger_count' field.
 *
 * @param value the new value for 'passenger_count'
 */
public void setPassengerCount(int value) {
  passenger_count = value;
}
/**
 * Reads the 'trip_distance' field.
 *
 * @return the current value of 'trip_distance'
 */
public double getTripDistance() {
  return this.trip_distance;
}
/**
 * Overwrites the 'trip_distance' field.
 *
 * @param value the new value for 'trip_distance'
 */
public void setTripDistance(double value) {
  trip_distance = value;
}
/**
 * Reads the 'pu_location_id' field.
 *
 * @return the current value of 'pu_location_id'
 */
public java.lang.Long getPuLocationId() {
  return this.pu_location_id;
}
/**
 * Overwrites the 'pu_location_id' field.
 *
 * @param value the new value for 'pu_location_id'
 */
public void setPuLocationId(java.lang.Long value) {
  pu_location_id = value;
}
/**
 * Factory for an empty RideRecordCompatible builder.
 *
 * @return a freshly constructed RideRecordCompatible Builder
 */
public static schemaregistry.RideRecordCompatible.Builder newBuilder() {
  schemaregistry.RideRecordCompatible.Builder fresh =
      new schemaregistry.RideRecordCompatible.Builder();
  return fresh;
}
/**
 * Factory for a RideRecordCompatible builder seeded from another builder.
 *
 * @param other the builder to copy; when null, a builder is created with the no-arg constructor
 * @return a new RideRecordCompatible Builder
 */
public static schemaregistry.RideRecordCompatible.Builder newBuilder(schemaregistry.RideRecordCompatible.Builder other) {
  // Guard clause: nothing to copy from.
  if (other == null) {
    return new schemaregistry.RideRecordCompatible.Builder();
  }
  return new schemaregistry.RideRecordCompatible.Builder(other);
}
/**
 * Factory for a RideRecordCompatible builder seeded from an existing RideRecordCompatible
 * instance.
 *
 * @param other the instance to copy; when null, a builder is created with the no-arg constructor
 * @return a new RideRecordCompatible Builder
 */
public static schemaregistry.RideRecordCompatible.Builder newBuilder(schemaregistry.RideRecordCompatible other) {
  return (other == null)
      ? new schemaregistry.RideRecordCompatible.Builder()
      : new schemaregistry.RideRecordCompatible.Builder(other);
}
/**
* RecordBuilder for RideRecordCompatible instances.
*/
@org.apache.avro.specific.AvroGenerated
public static class Builder extends org.apache.avro.specific.SpecificRecordBuilderBasenewBuilder().
*/
public RideRecordNoneCompatible() {
  // Intentionally empty: no field is assigned here.
}
/**
 * Constructs a record with every field supplied by the caller.
 *
 * @param vendorId value stored in the 'vendorId' field
 * @param passenger_count value stored in the 'passenger_count' field
 * @param trip_distance value stored in the 'trip_distance' field
 */
public RideRecordNoneCompatible(
    java.lang.Integer vendorId,
    java.lang.Integer passenger_count,
    java.lang.Double trip_distance) {
  // Parameters shadow the fields, so each assignment is qualified with 'this'.
  this.vendorId = vendorId;
  this.passenger_count = passenger_count;
  this.trip_distance = trip_distance;
}
/**
 * Returns the MODEL$ value of this class.
 *
 * @return MODEL$
 */
@Override
public org.apache.avro.specific.SpecificData getSpecificData() {
  return MODEL$;
}
/**
 * Returns the SCHEMA$ value of this class.
 *
 * @return SCHEMA$
 */
@Override
public org.apache.avro.Schema getSchema() {
  return SCHEMA$;
}
/**
 * Positional field read. Used by DatumWriter. Applications should not call.
 *
 * @param field$ field position: 0 = vendorId, 1 = passenger_count, 2 = trip_distance
 * @return the value of the field at that position
 * @throws IndexOutOfBoundsException if {@code field$} is not 0, 1 or 2
 */
@Override
public java.lang.Object get(int field$) {
  if (field$ == 0) {
    return vendorId;
  }
  if (field$ == 1) {
    return passenger_count;
  }
  if (field$ == 2) {
    return trip_distance;
  }
  throw new IndexOutOfBoundsException("Invalid index: " + field$);
}
/**
 * Positional field write. Used by DatumReader. Applications should not call.
 *
 * @param field$ field position: 0 = vendorId, 1 = passenger_count, 2 = trip_distance
 * @param value$ the value to store; positions 0 and 1 cast to Integer, position 2 to Double
 * @throws IndexOutOfBoundsException if {@code field$} is not 0, 1 or 2
 */
@Override
@SuppressWarnings("unchecked")
public void put(int field$, java.lang.Object value$) {
  if (field$ == 0) {
    vendorId = (java.lang.Integer) value$;
  } else if (field$ == 1) {
    passenger_count = (java.lang.Integer) value$;
  } else if (field$ == 2) {
    trip_distance = (java.lang.Double) value$;
  } else {
    throw new IndexOutOfBoundsException("Invalid index: " + field$);
  }
}
/**
 * Reads the 'vendorId' field.
 *
 * @return the current value of 'vendorId'
 */
public int getVendorId() {
  return this.vendorId;
}
/**
 * Overwrites the 'vendorId' field.
 *
 * @param value the new value for 'vendorId'
 */
public void setVendorId(int value) {
  vendorId = value;
}
/**
 * Reads the 'passenger_count' field.
 *
 * @return the current value of 'passenger_count'
 */
public int getPassengerCount() {
  return this.passenger_count;
}
/**
 * Overwrites the 'passenger_count' field.
 *
 * @param value the new value for 'passenger_count'
 */
public void setPassengerCount(int value) {
  passenger_count = value;
}
/**
 * Reads the 'trip_distance' field.
 *
 * @return the current value of 'trip_distance'
 */
public double getTripDistance() {
  return this.trip_distance;
}
/**
 * Overwrites the 'trip_distance' field.
 *
 * @param value the new value for 'trip_distance'
 */
public void setTripDistance(double value) {
  trip_distance = value;
}
/**
 * Factory for an empty RideRecordNoneCompatible builder.
 *
 * @return a freshly constructed RideRecordNoneCompatible Builder
 */
public static schemaregistry.RideRecordNoneCompatible.Builder newBuilder() {
  schemaregistry.RideRecordNoneCompatible.Builder fresh =
      new schemaregistry.RideRecordNoneCompatible.Builder();
  return fresh;
}
/**
 * Factory for a RideRecordNoneCompatible builder seeded from another builder.
 *
 * @param other the builder to copy; when null, a builder is created with the no-arg constructor
 * @return a new RideRecordNoneCompatible Builder
 */
public static schemaregistry.RideRecordNoneCompatible.Builder newBuilder(schemaregistry.RideRecordNoneCompatible.Builder other) {
  // Guard clause: nothing to copy from.
  if (other == null) {
    return new schemaregistry.RideRecordNoneCompatible.Builder();
  }
  return new schemaregistry.RideRecordNoneCompatible.Builder(other);
}
/**
 * Factory for a RideRecordNoneCompatible builder seeded from an existing
 * RideRecordNoneCompatible instance.
 *
 * @param other the instance to copy; when null, a builder is created with the no-arg constructor
 * @return a new RideRecordNoneCompatible Builder
 */
public static schemaregistry.RideRecordNoneCompatible.Builder newBuilder(schemaregistry.RideRecordNoneCompatible other) {
  return (other == null)
      ? new schemaregistry.RideRecordNoneCompatible.Builder()
      : new schemaregistry.RideRecordNoneCompatible.Builder(other);
}
/**
* RecordBuilder for RideRecordNoneCompatible instances.
*/
@org.apache.avro.specific.AvroGenerated
public static class Builder extends org.apache.avro.specific.SpecificRecordBuilderBase
Master the fundamentals of data engineering by building an end-to-end data pipeline from scratch. Gain hands-on experience with industry-standard tools and best practices.
Join Slack • #course-data-engineering Channel • Telegram Announcements • Course Playlist • FAQ
## How to Enroll ### 2026 Cohort - **Start Date**: 12 January 2026 - **Register Here**: [Sign up](https://airtable.com/shr6oVXeQvSI5HuWD) ### Self-Paced Learning All course materials are freely available for independent study. Follow these steps: 1. Watch the course videos. 2. Join the [Slack community](https://datatalks.club/slack.html). 3. Refer to the [FAQ document](https://datatalks.club/faq/data-engineering-zoomcamp.html) for guidance. ## Syllabus Overview The course consists of structured modules, hands-on workshops, and a final project to reinforce your learning. ### **Prerequisites** To get the most out of this course, you should have: - Basic coding experience - Familiarity with SQL - Experience with Python (helpful but not required) No prior data engineering experience is necessary. ### **Modules** #### [Module 1: Containerization and Infrastructure as Code](01-docker-terraform/) - Introduction to GCP - Docker and Docker Compose - Running PostgreSQL with Docker - Infrastructure setup with Terraform - Homework #### [Module 2: Workflow Orchestration](02-workflow-orchestration/) - Data Lakes and Workflow Orchestration - Workflow orchestration with Kestra - Homework #### [Workshop 1: Data Ingestion](cohorts/2026/workshops/dlt.md) - API reading and pipeline scalability - Data normalization and incremental loading - Homework #### [Module 3: Data Warehousing](03-data-warehouse/) - Introduction to BigQuery - Partitioning, clustering, and best practices - Machine learning in BigQuery #### [Module 4: Analytics Engineering](04-analytics-engineering/) - Analytics Engineering and Data Modeling - dbt (data build tool) with DuckDB & BigQuery - Testing, documentation, and deployment #### [Module 5: Data Platforms](05-data-platforms/) - Building end-to-end data pipelines with Bruin - Data ingestion, transformation, and quality - Deployment to cloud (BigQuery) #### [Module 6: Batch Processing](06-batch/) - Introduction to Apache Spark - DataFrames and SQL - Internals of 
GroupBy and Joins #### [Module 7: Streaming](07-streaming/) - Introduction to Kafka - Kafka Streams and KSQL - Schema management with Avro #### [Final Project](projects/) - Apply all concepts learned in a real-world scenario - Peer review and feedback process ## Testimonials > Thank you for what you do! The Data Engineering Zoomcamp gave me skills that helped me land my first tech job. > > — [Tim Claytor](https://www.linkedin.com/in/claytor/) ([Source](https://www.linkedin.com/feed/update/urn:li:activity:7396882073308938240?commentUrn=urn%3Ali%3Acomment%3A%28activity%3A7396882073308938240%2C7396889959711793152%29&dashCommentUrn=urn%3Ali%3Afsd_comment%3A%287396889959711793152%2Curn%3Ali%3Aactivity%3A7396882073308938240%29)) > Three months might seem like a long time, but the growth and learning during this period are truly remarkable. It was a great experience with a lot of learning, connecting with like-minded people from all around the world, and having fun. I must admit, this was really hard. But the feeling of accomplishment and learning made it all worthwhile. And I would do it again! > > — [Nevenka Lukic](https://www.linkedin.com/in/nevenka-lukic/) ([Source](https://www.linkedin.com/posts/nevenka-lukic_data-engineering-zoomcamp-final-project-activity-7181985646033461248-Lc1O?utm_source=share&utm_medium=member_desktop&rcm=ACoAADJu9vMBW6iyIYswCQnN6t8UJLkXH2tQPi4)) > One of the significant things I inferred from the Zoomcamp is to prioritize fundamentals and principles over ever-evolving tools and tech stacks. Hugely grateful to Alexey Grigorev for putting together this incredible course and offering it for free. > > — [Siddhartha Gogoi](https://www.linkedin.com/in/siddhartha-gogoi/) ([Source](https://www.linkedin.com/posts/activity-7325692407675604992-XSKI?utm_source=share&utm_medium=member_desktop&rcm=ACoAADJu9vMBW6iyIYswCQnN6t8UJLkXH2tQPi4)) > Such a fun deep dive into data engineering, cloud automation, and orchestration. I learned so much along the way. 
Big shoutout to Alexey Grigorev and the DataTalksClub team for the opportunity and guidance throughout the 3 months of the free course. > > — [Assitan NIARE](https://www.linkedin.com/in/assitan-niar%C3%A9-data/) ([Source](https://www.linkedin.com/posts/activity-7317441554023874561-E3wm?utm_source=share&utm_medium=member_desktop&rcm=ACoAADJu9vMBW6iyIYswCQnN6t8UJLkXH2tQPi4)) > If you’re serious about breaking into data engineering, start here. The repo’s structure, community, and hands-on focus make it unparalleled. > > — [Wady Osama](https://www.linkedin.com/in/wadyosama/) ([Source](https://www.linkedin.com/posts/wadyosama_dataengineering-zoomcamp-dezoomcamp-activity-7292126824711520258-puJm?utm_source=share&utm_medium=member_desktop&rcm=ACoAADJu9vMBW6iyIYswCQnN6t8UJLkXH2tQPi4)) ## Community & Support ### **Getting Help on Slack** Join the [`#course-data-engineering`](https://app.slack.com/client/T01ATQK62F8/C01FABYF2RG) channel on [DataTalks.Club Slack](https://datatalks.club/slack.html) for discussions, troubleshooting, and networking. To keep discussions organized: - Follow [our guidelines](asking-questions.md) when posting questions. - Review the [community guidelines](https://datatalks.club/slack/guidelines.html). 
## Meet the Instructors - [Alexey Grigorev](https://linkedin.com/in/agrigorev) - [Michael Shoemaker](https://www.linkedin.com/in/michaelshoemaker1/) - [Will Russell](https://www.linkedin.com/in/wrussell1999/) - [Anna Geller](https://www.linkedin.com/in/anna-geller-12a86811a/) - [Juan Manuel Perafan](https://www.linkedin.com/in/jmperafan/) - [Arsalan Noorafkan](https://www.linkedin.com/in/arsalan0/) Past instructors: - [Victoria Perez Mola](https://www.linkedin.com/in/victoriaperezmola/) - [Ankush Khanna](https://linkedin.com/in/ankushkhanna2) - [Sejal Vaidya](https://www.linkedin.com/in/vaidyasejal/) - [Irem Erturk](https://www.linkedin.com/in/iremerturk/) - [Luis Oliveira](https://www.linkedin.com/in/lgsoliveira/) - [Zach Wilson](https://www.linkedin.com/in/eczachly) ## Sponsors & Supporters A special thanks to our course sponsors for making this initiative possible! Interested in supporting our community? Reach out to [alexey@datatalks.club](mailto:alexey@datatalks.club). ## About DataTalks.Club
DataTalks.Club is a global online community of data enthusiasts. It's a place to discuss data, learn, share knowledge, ask and answer questions, and support each other.
Website • Join Slack Community • Newsletter • Upcoming Events • YouTube • GitHub • LinkedIn • Twitter
All the activity at DataTalks.Club mainly happens on [Slack](https://datatalks.club/slack.html). We post updates there and discuss different aspects of data, career questions, and more. At DataTalksClub, we organize online events, community activities, and free courses. You can learn more about what we do at [DataTalksClub Community Navigation](https://www.notion.so/DataTalksClub-Community-Navigation-bf070ad27ba44bf6bbc9222082f0e5a8?pvs=21). ================================================ FILE: after-sign-up.md ================================================ ## Thank you! Thanks for signing up for the course. The process of adding you to the mailing list is not automated yet, but you will hear from us closer to the course start. To make sure you don't miss any announcements - Register in [DataTalks.Club's Slack](https://datatalks.club/slack.html) and join the [`#course-data-engineering`](https://app.slack.com/client/T01ATQK62F8/C01FABYF2RG) channel - Join the [course Telegram channel with announcements](https://t.me/dezoomcamp) - Subscribe to [DataTalks.Club's YouTube channel](https://www.youtube.com/c/DataTalksClub) and check [the course playlist](https://www.youtube.com/playlist?list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb) See you in January! ================================================ FILE: asking-questions.md ================================================ ## Asking questions If you have any questions, ask them in the [`#course-data-engineering`](https://app.slack.com/client/T01ATQK62F8/C01FABYF2RG) channel in [DataTalks.Club](https://datatalks.club) slack. To keep our discussion in Slack more organized, we ask you to follow these suggestions: * First, review How to troubleshoot issues listed below. * Before asking a question, check the [FAQ](https://datatalks.club/faq/data-engineering-zoomcamp.html). * Before asking a question review the [Slack Guidelines](#Ask-in-Slack). 
* If somebody helped you with your problem and it's not in [FAQ](https://datatalks.club/faq/data-engineering-zoomcamp.html), please add it there. It'll help other students. * Zed Shaw (of the Learn the Hard Way series) has [a great post on how to help others help you](https://learncodethehardway.com/blog/03-how-to-ask-for-help/) * Check [Stackoverflow guide on asking](https://stackoverflow.com/help/how-to-ask) ### How to troubleshoot issues The first step is to try to solve the issue on your own; get used to solving problems. This will be a real life skill you need when employed. 1. What does the error say? There will often be a description of the error or instructions on what is needed, I have even seen a link to the solution. Does it reference a specific line of your code? 2. Restart the application or server/pc. 3. Google it. It is going to be rare that you are the first to have the problem, someone out there has posted the issue and likely the solution. Search using: **technology** **problem statement**. Example: `pgcli error column c.relhasoids does not exist`. * There are often different solutions for the same problem due to variation in environments. 4. Check the tech’s documentation. Use its search if available or use the browser's search function. 5. Try uninstall (this may remove the bad actor) and reinstall of application or re-implementation of action. Don’t forget to restart the server/pc for reinstalls. * Sometimes reinstalling fails to resolve the issue but works if you uninstall first. 6. Ask in Slack 7. Take a break and come back to it later. You will be amazed at how often you figure out the solution after letting your brain rest. Get some fresh air, work out, play a video game, watch a tv show, whatever allows your brain to not think about it for a little while or even until the next day. 8. 
Remember technology issues in real life sometimes take days or even weeks to resolve ### Asking in Slack * Before asking a question, check the [FAQ](https://datatalks.club/faq/data-engineering-zoomcamp.html). * DO NOT use screenshots, especially don’t take pictures from a phone. * DO NOT tag instructors, it may discourage others from helping you. * Copy and paste errors; if it’s long, just post it in a reply to your thread. * Use ``` for formatting your code. * Use the same thread for the conversation (that means replying to your own thread). * DO NOT create multiple posts to discuss the issue. * You may create a new post if the issue reemerges down the road. Be sure to describe what has changed in the environment. * Provide additional information in the same thread of the steps you have taken for resolution. ================================================ FILE: awesome-data-engineering.md ================================================ Have you found any cool resources about data engineering? 
Put them here ## Learning Data Engineering ### Courses * [Data Engineering Zoomcamp](https://github.com/DataTalksClub/data-engineering-zoomcamp) by DataTalks.Club (free) * [Big Data Platforms, Autumn 2022: Introduction to Big Data Processing Frameworks](https://big-data-platforms-22.mooc.fi/) by the University of Helsinki (free) * [Awesome Data Engineering Learning Path](https://awesomedataengineering.com/) ### Books * [Designing Data-Intensive Applications: The Big Ideas Behind Reliable, Scalable, and Maintainable Systems by Martin Kleppmann](https://www.amazon.com/Designing-Data-Intensive-Applications-Reliable-Maintainable/dp/1449373321) * [Big Data: Principles and Best Practices of Scalable Realtime Data Systems by Nathan Marz, James Warren](https://www.amazon.com/Big-Data-Principles-practices-scalable/dp/1617290343) * [Practical DataOps: Delivering Agile Data Science at Scale by Harvinder Atwal](https://www.amazon.com/Practical-DataOps-Delivering-Agile-Science/dp/1484251032) * [Data Pipelines Pocket Reference: Moving and Processing Data for Analytics by James Densmore](https://www.amazon.com/Data-Pipelines-Pocket-Reference-Processing/dp/1492087831) * [Best books for data engineering](https://awesomedataengineering.com/data_engineering_best_books) * [Fundamentals of Data Engineering: Plan and Build Robust Data Systems by Joe Reis, Matt Housley](https://www.amazon.com/Fundamentals-Data-Engineering-Robust-Systems/dp/1098108302) ### Introduction to Data Engineering Terms * [https://datatalks.club/podcast/s05e02-data-engineering-acronyms.html](https://datatalks.club/podcast/s05e02-data-engineering-acronyms.html) ### Data engineering in practice Conference talks from companies, blog posts, etc * [Uber Data Archives](https://eng.uber.com/category/articles/uberdata/) (Uber engineering blog) * [Data Engineering Weekly (DE-focused substack)](https://www.dataengineeringweekly.com/) * [Seattle Data Guy (DE-focused substack)](https://seattledataguy.substack.com/) ## Doing 
Data Engineering ### Coding & Python * [CS50's Introduction to Computer Science | edX](https://www.edx.org/course/introduction-computer-science-harvardx-cs50x) (course) * [Python for Everybody Specialization](https://www.coursera.org/specializations/python) (course) * [Practical Python programming](https://github.com/dabeaz-course/practical-python/blob/master/Notes/Contents.md) ### SQL * [Intro to SQL: Querying and managing data | Khan Academy](https://www.khanacademy.org/computing/computer-programming/sql) * [Mode SQL Tutorial](https://mode.com/sql-tutorial/) * [Use The Index, Luke](https://use-the-index-luke.com/) (SQL Indexing and Tuning e-Book) * [SQL Performance Explained](https://sql-performance-explained.com/) (book) ### Workflow orchestration * [What is DAG?](https://youtu.be/1Yh5S-S6wsI) (video) * [Airflow, Prefect, and Dagster: An Inside Look](https://towardsdatascience.com/airflow-prefect-and-dagster-an-inside-look-6074781c9b77) (blog post) * [Open-Source Spotlight - Prefect - Kevin Kho](https://www.youtube.com/watch?v=ISLV9JyqF1w) (video) * [Prefect as a Data Engineering Project Workflow Tool, with Mary Clair Thompson (Duke) - 11/6/2020](https://youtu.be/HuwA4wLQtCM) (video) ### ETL and ELT * [ETL vs. ELT: What’s the Difference?](https://rivery.io/blog/etl-vs-elt/) (blog post) (print version) ### Data lakes * [An Introduction to Modern Data Lake Storage Layers (Hudi, Iceberg, Delta Lake)](https://dacort.dev/posts/modern-data-lake-storage-layers/) (blog post) * [Lake House Architecture @ Halodoc: Data Platform 2.0](https://blogs.halodoc.io/lake-house-architecture-halodoc-data-platform-2-0/amp/) (blog post) ### Data warehousing * [Guide to Data Warehousing. 
Short and comprehensive information… | by Tomas Peluritis](https://medium.com/towards-data-science/guide-to-data-warehousing-6fdcf30b6fbe) (blog post) * [Snowflake, Redshift, BigQuery, and Others: Cloud Data Warehouse Tools Compared](https://www.altexsoft.com/blog/snowflake-redshift-bigquery-data-warehouse-tools/) (blog post) ### Streaming * Building Streaming Analytics: The Journey and Learnings - Maxim Lukichev ### DataOps * [DataOps 101 with Lars Albertsson – DataTalks.Club](https://datatalks.club/podcast/s02e11-dataops.html) (podcast) * ### Monitoring and observability * [Data Observability: The Next Frontier of Data Engineering with Barr Moses](https://datatalks.club/podcast/s03e03-data-observability.html) (podcast) ### Analytics engineering * [Analytics Engineer: New Role in a Data Team with Victoria Perez Mola](https://datatalks.club/podcast/s03e11-analytics-engineer.html) (podcast) * [Modern Data Stack for Analytics Engineering - Kyle Shannon](https://www.youtube.com/watch?v=UmIZIkeOfi0) (video) * [Analytics Engineering vs Data Engineering | RudderStack Blog](https://www.rudderstack.com/blog/analytics-engineering-vs-data-engineering) (blog post) * [Learn the Fundamentals of Analytics Engineering with dbt](https://courses.getdbt.com/courses/fundamentals) (course) ### Data mesh * [Data Mesh in Practice - Max Schultze](https://www.youtube.com/watch?v=ekEc8D_D3zY) (video) ### Cloud * [https://acceldataio.medium.com/data-engineering-best-practices-how-netflix-keeps-its-data-infrastructure-cost-effective-dee310bcc910](https://acceldataio.medium.com/data-engineering-best-practices-how-netflix-keeps-its-data-infrastructure-cost-effective-dee310bcc910) ### Reverse ETL * TODO: What is reverse ETL? 
* [https://datatalks.club/podcast/s05e02-data-engineering-acronyms.html](https://datatalks.club/podcast/s05e02-data-engineering-acronyms.html) * [Open-Source Spotlight - Grouparoo - Brian Leonard](https://www.youtube.com/watch?v=hswlcgQZYuw) (video) * [Open-Source Spotlight - Castled.io (Reverse ETL) - Arun Thulasidharan](https://www.youtube.com/watch?v=iW0XhltAUJ8) (video) ## Career in Data Engineering * [From Data Science to Data Engineering with Ellen König – DataTalks.Club](https://datatalks.club/podcast/s07e08-from-data-science-to-data-engineering.html) (podcast) * [Big Data Engineer vs Data Scientist with Roksolana Diachuk – DataTalks.Club](https://datatalks.club/podcast/s04e03-big-data-engineer-vs-data-scientist.html) (podcast) * [What Skills Do You Need to Become a Data Engineer](https://www.linkedin.com/pulse/what-skills-do-you-need-become-data-engineer-peng-wang/) (blog post) * [The future history of Data Engineering](https://groupby1.substack.com/p/data-engineering?s=r) (blog post) * [What Skills Do Data Engineers Need](https://www.theseattledataguy.com/what-skills-do-data-engineers-need/) (blog post) ### Data Engineering Management * [Becoming a Data Engineering Manager with Rahul Jain – DataTalks.Club](https://datatalks.club/podcast/s07e07-becoming-a-data-engineering-manager.html) (podcast) ## Data engineering projects * [How To Start A Data Engineering Project - With Data Engineering Project Ideas](https://www.youtube.com/watch?v=WpN47Jddo7I) (video) * [Data Engineering Project for Beginners - Batch edition](https://www.startdataengineering.com/post/data-engineering-project-for-beginners-batch-edition/) (blog post) * [Building a Data Engineering Project in 20 Minutes](https://www.sspaeti.com/blog/data-engineering-project-in-twenty-minutes/) (blog post) * [Automating Nike Run Club Data Analysis with Python, Airflow and Google Data Studio | by Rich Martin | 
Medium](https://medium.com/@rich_23525/automating-nike-run-club-data-analysis-with-python-airflow-and-google-data-studio-3c9556478926) (blog post) ## Data Engineering Resources ### Blogs * [Start Data Engineering](https://www.startdataengineering.com/) ### Podcasts * [The Data Engineering Podcast](https://www.dataengineeringpodcast.com/) * [DataTalks.Club Podcast](https://datatalks.club/podcast.html) (only some episodes are about data engineering) * ### Communities * [DataTalks.Club](https://datatalks.club/) * [/r/dataengineering](https://www.reddit.com/r/dataengineering) ### Meetups * [Sydney Data Engineers](https://sydneydataengineers.github.io/) ### People to follow on Twitter and LinkedIn * TODO ### YouTube channels * [Karolina Sowinska - YouTube](https://www.youtube.com/channel/UCAxnMry1lETl47xQWABvH7g) * [Seattle Data Guy - YouTube](https://www.youtube.com/c/SeattleDataGuy) * [Andreas Kretz - YouTube](https://www.youtube.com/c/andreaskayy) * [DataTalksClub - YouTube](https://youtube.com/c/datatalksclub) (only some videos are about data engineering) ### Resource aggregators * [Reading List](https://www.scling.com/reading-list/) by Lars Albertsson * [GitHub - igorbarinov/awesome-data-engineering](https://github.com/igorbarinov/awesome-data-engineering) (focus is more on tools) * [GitHub - DataExpert-io/data-engineer-handbook](https://github.com/DataExpert-io/data-engineer-handbook) (contains tools, blogs and more) ## License This work is licensed under a Creative Commons Attribution 4.0 International License. CC BY 4.0 ================================================ FILE: certificates.md ================================================ ## Getting your certificate Congratulations on finishing the course! You can find your certificate in your enrollment profile (you need to be logged in): * For the 2025 edition, it's https://courses.datatalks.club/de-zoomcamp-2025/enrollment If you can't find a certificate in your profile, it means you didn't pass the project. 
If you believe it's a mistake, write in the course channel in Slack. ## Adding to LinkedIn You can add your certificate to LinkedIn: * Log in to your LinkedIn account, then go to your profile. * On the right, in the "Add profile" section dropdown, choose "Background" and then select the drop-down triangle next to "Licenses & Certifications". * In "Name", enter "Data Engineering Zoomcamp". * In "Issuing Organization", enter "DataTalksClub". * (Optional) In "Issue Date", enter the time when the certificate was created. * (Optional) Select the checkbox This certification does not expire. * Put your certificate ID. * In "Certification URL", enter the URL for your certificate. [Adapted from here](https://support.edx.org/hc/en-us/articles/206501938-How-can-I-add-my-certificate-to-my-LinkedIn-profile-) ================================================ FILE: cohorts/2022/README.md ================================================ ### 2022 Cohort * **Start**: 17 January 2022 * **Registration link**: https://airtable.com/shr6oVXeQvSI5HuWD * [Leaderboard](https://docs.google.com/spreadsheets/d/e/2PACX-1vR9oQiYnAVvzL4dagnhvp0sngqagF0AceD0FGjhS-dnzMTBzNQIal3-hOgkTibVQvfuqbQ69b0fvRnf/pubhtml) ================================================ FILE: cohorts/2022/project.md ================================================ ## Course Project The goal of this project is to apply everything we learned in this course and build an end-to-end data pipeline. Remember that to pass the project, you must evaluate 3 peers. If you don't do that, your project can't be considered complete. 
### Submitting #### Project Cohort #2 Project: * Form: https://forms.gle/JECXB9jYQ1vBXbsw6 * Deadline: 2 May, 22:00 CET Peer reviewing: * Peer review assignments: [link](https://docs.google.com/spreadsheets/d/e/2PACX-1vShnv8T4iY_5NA8h0nySIS8Wzr-DZGGigEikIW4ZMSi9HlvhaEB4RhwmepVIuIUGaQHS90r5iHR2YXV/pubhtml?gid=964123374&single=true) * Form: https://forms.gle/Pb2fBwYLQ3GGFsaK6 * Deadline: 9 May, 22:00 CET #### Project Cohort #1 Project: * Form: https://forms.gle/6aeVcEVJipqR2BqC8 * Deadline: 4 April, 22:00 CET Peer reviewing: * Peer review assignments: [link](https://docs.google.com/spreadsheets/d/e/2PACX-1vShnv8T4iY_5NA8h0nySIS8Wzr-DZGGigEikIW4ZMSi9HlvhaEB4RhwmepVIuIUGaQHS90r5iHR2YXV/pubhtml) * Form: https://forms.gle/AZ62bXMp4SGcVUmK7 * Deadline: 11 April, 22:00 CET Project feedback: [link](https://docs.google.com/spreadsheets/d/e/2PACX-1vRcVCkO-jes5mbPAcikn9X_s2laJ1KhsO8aibHYQxxKqdCUYMVTEJLJQdM8C5aAUWKFl_0SJW4rme7H/pubhtml) ================================================ FILE: cohorts/2022/week_1_basics_n_setup/homework.md ================================================ ## Week 1 Homework In this homework we'll prepare the environment and practice with terraform and SQL ## Question 1. Google Cloud SDK Install Google Cloud SDK. What's the version you have? To get the version, run `gcloud --version` ## Google Cloud account Create an account in Google Cloud and create a project. ## Question 2. Terraform Now install terraform and go to the terraform directory (`week_1_basics_n_setup/1_terraform_gcp/terraform`) After that, run * `terraform init` * `terraform plan` * `terraform apply` Apply the plan and copy the output (after running `apply`) to the form. It should be the entire output - from the moment you typed `terraform init` to the very end. 
## Prepare Postgres Run Postgres and load data as shown in the videos We'll use the yellow taxi trips from January 2021: ```bash wget https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2021-01.csv ``` You will also need the dataset with zones: ```bash wget https://s3.amazonaws.com/nyc-tlc/misc/taxi+_zone_lookup.csv ``` Download this data and put it to Postgres ## Question 3. Count records How many taxi trips were there on January 15? Consider only trips that started on January 15. ## Question 4. Largest tip for each day Find the largest tip for each day. On which day it was the largest tip in January? Use the pick up time for your calculations. (note: it's not a typo, it's "tip", not "trip") ## Question 5. Most popular destination What was the most popular destination for passengers picked up in central park on January 14? Use the pick up time for your calculations. Enter the zone name (not id). If the zone name is unknown (missing), write "Unknown" ## Question 6. Most expensive locations What's the pickup-dropoff pair with the largest average price for a ride (calculated based on `total_amount`)? Enter two zone names separated by a slash For example: "Jamaica Bay / Clinton East" If any of the zone names are unknown (missing), write "Unknown". For example, "Unknown / Clinton East". ## Submitting the solutions * Form for submitting: https://forms.gle/yGQrkgRdVbiFs8Vd7 * You can submit your homework multiple times. In this case, only the last submission will be used. Deadline: 26 January (Wednesday), 22:00 CET ## Solution Here is the solution to questions 3-6: [video](https://www.youtube.com/watch?v=HxHqH2ARfxM&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb) ================================================ FILE: cohorts/2022/week_2_data_ingestion/README.md ================================================ ## Week 2: Data Ingestion ### Data Lake (GCS) * What is a Data Lake * ELT vs. ETL * Alternatives to components (S3/HDFS, Redshift, Snowflake etc.) 
:movie_camera: [Video](https://www.youtube.com/watch?v=W3Zm6rjOq70&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb) [Slides](https://docs.google.com/presentation/d/1RkH-YhBz2apIjYZAxUz2Uks4Pt51-fVWVN9CcH9ckyY/edit?usp=sharing) ### Introduction to Workflow orchestration * What is an Orchestration Pipeline? * What is a DAG? * [Video](https://www.youtube.com/watch?v=0yK7LXwYeD0&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb) ### Setting up Airflow locally * Setting up Airflow with Docker-Compose * [Video](https://www.youtube.com/watch?v=lqDMzReAtrw&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb) * More information in the [airflow folder](airflow) If you want to run a lighter version of Airflow with fewer services, check this [video](https://www.youtube.com/watch?v=A1p5LQ0zzaQ&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb). It's optional. ### Ingesting data to GCP with Airflow * Extraction: Download and unpack the data * Pre-processing: Convert this raw data to parquet * Upload the parquet files to GCS * Create an external table in BigQuery * [Video](https://www.youtube.com/watch?v=9ksX9REfL8w&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=19) ### Ingesting data to Local Postgres with Airflow * Converting the ingestion script for loading data to Postgres to Airflow DAG * [Video](https://www.youtube.com/watch?v=s2U8MWJH5xA&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb) ### Transfer service (AWS -> GCP) Moving files from AWS to GCP. You will need an AWS account for this. This section is optional * [Video 1](https://www.youtube.com/watch?v=rFOFTfD1uGk&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb) * [Video 2](https://www.youtube.com/watch?v=VhmmbqpIzeI&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb) ### Homework In the homework, you'll create a few DAGs for processing the NY Taxi data for 2019-2021 More information [here](homework.md) ## Community notes Did you take notes? You can share them here. 
* [Notes from Alvaro Navas](https://github.com/ziritrion/dataeng-zoomcamp/blob/main/notes/2_data_ingestion.md) * [Notes from Aaron Wright](https://github.com/ABZ-Aaron/DataEngineerZoomCamp/blob/master/week_2_data_ingestion/README.md) * [Notes from Abd](https://itnadigital.notion.site/Week-2-Data-Ingestion-ec2d0d36c0664bc4b8be6a554b2765fd) * [Blog post by Isaac Kargar](https://kargarisaac.github.io/blog/data%20engineering/jupyter/2022/01/25/data-engineering-w2.html) * [Blog, notes, walkthroughs by Sandy Behrens](https://learningdataengineering540969211.wordpress.com/2022/01/30/week-2-de-zoomcamp-2-3-2-ingesting-data-to-gcp-with-airflow/) * [Notes from Apurva Hegde](https://github.com/apuhegde/Airflow-LocalExecutor-In-Docker#readme) * [Notes from Vincenzo Galante](https://binchentso.notion.site/Data-Talks-Club-Data-Engineering-Zoomcamp-8699af8e7ff94ec49e6f9bdec8eb69fd) * Add your notes here (above this line) ================================================ FILE: cohorts/2022/week_2_data_ingestion/airflow/.env_example ================================================ # Custom COMPOSE_PROJECT_NAME=dtc-de GOOGLE_APPLICATION_CREDENTIALS=/.google/credentials/google_credentials.json AIRFLOW_CONN_GOOGLE_CLOUD_DEFAULT=google-cloud-platform://?extra__google_cloud_platform__key_path=/.google/credentials/google_credentials.json # AIRFLOW_UID= GCP_PROJECT_ID= GCP_GCS_BUCKET= # Postgres POSTGRES_USER=airflow POSTGRES_PASSWORD=airflow POSTGRES_DB=airflow # Airflow AIRFLOW__CORE__EXECUTOR=LocalExecutor AIRFLOW__SCHEDULER__SCHEDULER_HEARTBEAT_SEC=10 AIRFLOW__CORE__SQL_ALCHEMY_CONN=postgresql+psycopg2://${POSTGRES_USER}:${POSTGRES_PASSWORD}@postgres:5432/${POSTGRES_DB} AIRFLOW_CONN_METADATA_DB=postgres+psycopg2://airflow:airflow@postgres:5432/airflow AIRFLOW_VAR__METADATA_DB_SCHEMA=airflow _AIRFLOW_WWW_USER_CREATE=True _AIRFLOW_WWW_USER_USERNAME=${_AIRFLOW_WWW_USER_USERNAME:airflow} _AIRFLOW_WWW_USER_PASSWORD=${_AIRFLOW_WWW_USER_PASSWORD:airflow} 
AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION=True AIRFLOW__CORE__LOAD_EXAMPLES=False ================================================ FILE: cohorts/2022/week_2_data_ingestion/airflow/1_setup_official.md ================================================ ## Setup (Official) ### Pre-Reqs 1. For the sake of standardization across this workshop's config, rename your gcp-service-accounts-credentials file to `google_credentials.json` & store it in your `$HOME` directory ``` bash cd ~ && mkdir -p ~/.google/credentials/ mv| Name | Project | Social | Links and comments |
|---|---|---|---|
| Katharina Eichinger | Project | ![]() |
|
| Alia Hamwi | Project | ![]() |
|
| Emmanuel Ikpesu | Project | ![]() |
More infoLinks: |
| Sanya Syed | Project | ![]() |
More infoLinks: > I am excited about the prospect of securing a challenging role as a Data Engineer, where I can utilise my skills and expertise to contribute meaningfully to an organisation's data-driven initiatives. |
| Aminu Lawal | Project | ![]() |
|
| Lisa Reiber | Project | ![]() |
More infoLinks: > always happy to connect with other data enthusiasts over topics like low-budget data engineering solutions for non-profits or AI solutions for non-profits |
| Vincenzo Galante | Project | ![]() |
More info> Thank you for having this course! |
| Grzegorz Gątkowski | Project | ![]() |
|
| Matt Young | Project | ![]() |
More infoLinks: > Experienced Developer | Cloud & Data Enthusiast | Open to Cloud & Data Engineering Roles 🌩️ ➜ C#, SQL, JavaScript, Python | BI, Data Analytics | AWS, Azure, GCP Passionate about data pipelines, storage, and processing. Excited to implement advanced cloud solutions and enable data-driven insights. Seeking Data Engineering opportunities to leverage my extensive SQL/Data Analytics experience and to transition into the world of cloud-based data solutions. Let's connect and collaborate on innovative data projects! #DataEngineering #CloudTechnology |
| Sam Hatley | Project | ![]() |
|
| Evan Hofmeister | Project | ![]() |
|
| Barys Kazarkin | Project | ![]() |
|
| Joshua Ati | Project | ![]() |
|
| Oleg Agapov | Project | ![]() |
|
| Mikhail Kuklin | Project | ![]() |
More infoLinks: |
| Emmanuel Letremble | Project | ![]() |
More infoLinks: > Thanks to the DataTalks.Club for completing my Full Stack & Machine Learning skill sets with some extra DE knowledge. |
| Victor Kuang | Project | ![]() |
|
| Antonis Angelakis | Project | ![]() |
|
| Christian Ruiz | |||
| Alex Pilugin | Project | ![]() |
|
| Ahmad Rizky | Project | ![]() |
|
| Juan Francisco Hernandez Hernandez | Project | ![]() |
More info> Thanks to Data Talks Club, it was amazing learning for me as a Career changer. |
| Iurii Chernigin | Project | ![]() |
|
| Franklyne Kibet | |||
| Federico Zambelli | Project | ![]() |
|
| Marilina Orihuela | Project | ![]() |
|
| Alejandro R. Mármol Ruiz | Project | ![]() |
|
| Daniel Takeshi | Project | ![]() |
|
| Xia He-Bleinagel | Project | ![]() |
More infoLinks: |
| Thorsten Foltz | ![]() |
||
| Danh Vo | Project | ![]() |
|
| Joseph Ologunja | Project | ![]() |
|
| Roman Zabolotin | Project | ![]() |
|
| Aditya Gupta | Project | ![]() |
More infoLinks: |
| Vladimir Bugaevskii | Project | ![]() |
|
| Fozan Talat | Project | ![]() |
|
| Alain Boisvert | Project | ![]() |
|
| reneboy garcia | Project | ![]() |
More info> "Success is not always about the grand achievements; it's about the small victories that accumulate over time." - Unknown |
| Svetlana Kononova | |||
| Dmitrii Nikolaev | ![]() |
More infoLinks: |
|
| Francis Romio | Project | ![]() |
|
| Saul Acevedo | Project | ![]() |
|
| Alina Li | Project | ![]() |
|
| Alexander Eryuzhev | Project | ![]() |
|
| Paul Nwosu | Project | ![]() |
More infoLinks:
|
| Param mirani | Project | ![]() |
|
| Oscar Garcia - ozkary | Project | ![]() |
|
| Hector Torres | Project | ![]() |
More infoLinks: > Currently looking for a position as data engineer |
| Dewi Nurfitri Oktaviani | Project | ![]() |
More infoLinks: |
| Ryno Marx | ![]() |
||
| Hidir Cem Altun | ![]() |
||
| Francis Mark Cayco | Project | ![]() |
|
| Adrian Baumann | Project | ![]() |
|
| Vladislav Garist | Project | ![]() |
|
| Gerald Ooi | ![]() |
||
| Roman | ![]() |
||
| Aleksandr Krasnov | ![]() |
More infoLinks: |
|
| Jaesung Ryu | Project | ![]() |
|
| António Damião Rodrigues | Project | ![]() |
|
| Alicia Escontrela | Project | ![]() |
|
| Chalermdej Lematavekul | Project | ![]() |
More info> Thank you so much for the course. Learn so many thing from here. |
| Muhammed Jimoh | Project | ![]() |
|
| Bartosz Skłodowski | Project | ![]() |
|
| Daniel Rigney | Project | ![]() |
|
| Daniel Gheorghita | Project | ![]() |
|
| Daniel Gheorghita | Project | ![]() |
|
| Niel Kemp | ![]() |
||
| Shahmir | Project | ![]() |
More infoLinks: > I've added a bunch of new features since the reviews! Check it out |
| Matt Bertrand | Project | ![]() |
|
| Nikolay Galkov | Project | ![]() |
|
| Hiroko Sakai | Project | ![]() |
|
| Rohit Joshi | Project | ![]() |
|
| Valerii Bazyrov | ![]() |
||
| Juan Pablo Ricapito | Project | ![]() |
|
| Ashraf Omara | Project | ![]() |
More info> I need to thank all of the data club community for this amazing contribution. |
| Wasawat Boonyarittikit | Project | ![]() |
|
| Fedor Faizov | Project | ![]() |
More info> Absolutly amazing course <3 |
You can load the data however you would like, but keep the files in .GZ Format. If you are using orchestration such as Airflow or Prefect do not load the data into Big Query using the orchestrator. Stop with loading the files into a bucket. NOTE: You can use the CSV option for the GZ files when creating an External Table SETUP: Create an external table using the fhv 2019 data. Create a table in BQ using the fhv 2019 data (do not partition or cluster this table). Data can be found here: https://github.com/DataTalksClub/nyc-tlc-data/releases/tag/fhv
## Question 1: What is the count for fhv vehicle records for year 2019? - 65,623,481 - 43,244,696 - 22,978,333 - 13,942,414 ## Question 2: Write a query to count the distinct number of affiliated_base_number for the entire dataset on both the tables. What is the estimated amount of data that will be read when this query is executed on the External Table and the Table? - 25.2 MB for the External Table and 100.87MB for the BQ Table - 225.82 MB for the External Table and 47.60MB for the BQ Table - 0 MB for the External Table and 0MB for the BQ Table - 0 MB for the External Table and 317.94MB for the BQ Table ## Question 3: How many records have both a blank (null) PUlocationID and DOlocationID in the entire dataset? - 717,748 - 1,215,687 - 5 - 20,332 ## Question 4: What is the best strategy to optimize the table if query always filter by pickup_datetime and order by affiliated_base_number? - Cluster on pickup_datetime Cluster on affiliated_base_number - Partition by pickup_datetime Cluster on affiliated_base_number - Partition by pickup_datetime Partition by affiliated_base_number - Partition by affiliated_base_number Cluster on pickup_datetime ## Question 5: Implement the optimized solution you chose for question 4. Write a query to retrieve the distinct affiliated_base_number between pickup_datetime 2019/03/01 and 2019/03/31 (inclusive). Use the BQ table you created earlier in your from clause and note the estimated bytes. Now change the table in the from clause to the partitioned table you created for question 4 and note the estimated bytes processed. What are these values? Choose the answer which most closely matches. 
- 12.82 MB for non-partitioned table and 647.87 MB for the partitioned table - 647.87 MB for non-partitioned table and 23.06 MB for the partitioned table - 582.63 MB for non-partitioned table and 0 MB for the partitioned table - 646.25 MB for non-partitioned table and 646.25 MB for the partitioned table ## Question 6: Where is the data stored in the External Table you created? - Big Query - GCP Bucket - Container Registry - Big Table ## Question 7: It is best practice in Big Query to always cluster your data: - True - False ## (Not required) Question 8: A better format to store these files may be parquet. Create a data pipeline to download the gzip files and convert them into parquet. Upload the files to your GCP Bucket and create an External and BQ Table. Note: Column types for all files used in an External Table must have the same datatype. While an External Table may be created and shown in the side panel in Big Query, this will need to be validated by running a count query on the External Table to check if any errors occur. ## Submitting the solutions * Form for submitting: https://forms.gle/rLdvQW2igsAT73HTA * You can submit your homework multiple times. In this case, only the last submission will be used. Deadline: 13 February (Monday), 22:00 CET ## Solution Solution: https://www.youtube.com/watch?v=j8r2OigKBWE ================================================ FILE: cohorts/2023/week_4_analytics_engineering/homework.md ================================================ ## Week 4 Homework In this homework, we'll use the models developed during the week 4 videos and enhance the already presented dbt project using the already loaded Taxi data for fhv vehicles for year 2019 in our DWH. This means that in this homework we use the following data [Datasets list](https://github.com/DataTalksClub/nyc-tlc-data/) * Yellow taxi data - Years 2019 and 2020 * Green taxi data - Years 2019 and 2020 * fhv data - Year 2019. 
We will use the data loaded for: * Building a source table: `stg_fhv_tripdata` * Building a fact table: `fact_fhv_trips` * Create a dashboard If you don't have access to GCP, you can do this locally using the ingested data from your Postgres database instead. If you have access to GCP, you don't need to do it for local Postgres - only if you want to. > **Note**: if your answer doesn't match exactly, select the closest option ### Question 1: **What is the count of records in the model fact_trips after running all models with the test run variable disabled and filtering for 2019 and 2020 data only (pickup datetime)?** You'll need to have completed the ["Build the first dbt models"](https://www.youtube.com/watch?v=UVI30Vxzd6c) video and have been able to run the models via the CLI. You should find the views and models for querying in your DWH. - 41648442 - 51648442 - 61648442 - 71648442 ### Question 2: **What is the distribution between service type filtering by years 2019 and 2020 data as done in the videos?** You will need to complete "Visualising the data" videos, either using [google data studio](https://www.youtube.com/watch?v=39nLTs74A3E) or [metabase](https://www.youtube.com/watch?v=BnLkrA7a6gM). - 89.9/10.1 - 94/6 - 76.3/23.7 - 99.1/0.9 ### Question 3: **What is the count of records in the model stg_fhv_tripdata after running all models with the test run variable disabled (:false)?** Create a staging model for the fhv data for 2019 and do not add a deduplication step. Run it via the CLI without limits (is_test_run: false). Filter records with pickup time in year 2019. - 33244696 - 43244696 - 53244696 - 63244696 ### Question 4: **What is the count of records in the model fact_fhv_trips after running all dependencies with the test run variable disabled (:false)?** Create a core model for the stg_fhv_tripdata joining with dim_zones. 
Similar to what we've done in fact_trips, keep only records with known pickup and dropoff location entries. Run it via the CLI without limits (is_test_run: false) and filter records with pickup time in year 2019. - 12998722 - 22998722 - 32998722 - 42998722 ### Question 5: **What is the month with the biggest amount of rides after building a tile for the fact_fhv_trips table?** Create a dashboard with some tiles that you find interesting to explore the data. One tile should show the amount of trips per month, as done in the videos for fact_trips, based on the fact_fhv_trips table. - March - April - January - December ## Submitting the solutions * Form for submitting: https://forms.gle/6A94GPutZJTuT5Y16 * You can submit your homework multiple times. In this case, only the last submission will be used. Deadline: 25 February (Saturday), 22:00 CET ## Solution * Video: https://www.youtube.com/watch?v=I_K0lNu9WQw&list=PL3MmuxUbc_hJjEePXIdE-LVUx_1ZZjYGW * Answers: * Question 1: 61648442 * Question 2: 89.9/10.1 * Question 3: 43244696 * Question 4: 22998722 * Question 5: January ================================================ FILE: cohorts/2023/week_5_batch_processing/homework.md ================================================ ## Week 5 Homework In this homework we'll put what we learned about Spark into practice. For this homework we will be using the FHVHV 2021-06 data found here. [FHVHV Data](https://github.com/DataTalksClub/nyc-tlc-data/releases/download/fhvhv/fhvhv_tripdata_2021-06.csv.gz) ### Question 1: **Install Spark and PySpark** - Install Spark - Run PySpark - Create a local spark session - Execute spark.version. What's the output? - 3.3.2 - 2.1.4 - 1.2.3 - 5.4 ### Question 2: **FHVHV June 2021** Read it with Spark using the same schema as we did in the lessons. We will use this dataset for all the remaining questions. Repartition it to 12 partitions and save it to parquet. 
What is the average size of the Parquet (ending with .parquet extension) Files that were created (in MB)? Select the answer which most closely matches. - 2MB - 24MB - 100MB - 250MB ### Question 3: **Count records** How many taxi trips were there on June 15? Consider only trips that started on June 15. - 308,164 - 12,856 - 452,470 - 50,982 ### Question 4: **Longest trip for each day** Now calculate the duration for each trip. How long was the longest trip in Hours? - 66.87 Hours - 243.44 Hours - 7.68 Hours - 3.32 Hours ### Question 5: **User Interface** Spark’s User Interface which shows application's dashboard runs on which local port? - 80 - 443 - 4040 - 8080 ### Question 6: **Most frequent pickup location zone** Load the zone lookup data into a temp view in Spark [Zone Data](https://github.com/DataTalksClub/nyc-tlc-data/releases/download/misc/taxi_zone_lookup.csv) Using the zone lookup data and the fhvhv June 2021 data, what is the name of the most frequent pickup location zone? - East Chelsea - Astoria - Union Sq - Crown Heights North ## Submitting the solutions * Form for submitting: https://forms.gle/EcSvDs6vp64gcGuD8 * You can submit your homework multiple times. In this case, only the last submission will be used. 
Deadline: 06 March (Monday), 22:00 CET ## Solution * Video: https://www.youtube.com/watch?v=ldoDIT32pJs * Answers: * Question 1: 3.3.2 * Question 2: 24MB * Question 3: 452,470 * Question 4: 66.87 Hours * Question 5: 4040 * Question 6: Crown Heights North ================================================ FILE: cohorts/2023/week_6_stream_processing/client.properties ================================================ # Required connection configs for Kafka producer, consumer, and admin bootstrap.servers=For this homework we will be using the 2022 Green Taxi Trip Record Parquet Files from the New York City Taxi Data found here: https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page If you are using orchestration such as Mage, Airflow or Prefect do not load the data into Big Query using the orchestrator. Stop with loading the files into a bucket. NOTE: You will need to use the PARQUET option files when creating an External Table SETUP: Create an external table using the Green Taxi Trip Records Data for 2022. Create a table in BQ using the Green Taxi Trip Records for 2022 (do not partition or cluster this table).
## Question 1: What is the count of records for the 2022 Green Taxi Data? - 65,623,481 - 840,402 - 1,936,423 - 253,647 ## Question 2: Write a query to count the distinct number of PULocationIDs for the entire dataset on both the tables. What is the estimated amount of data that will be read when this query is executed on the External Table and the Table? - 0 MB for the External Table and 6.41MB for the Materialized Table - 18.82 MB for the External Table and 47.60 MB for the Materialized Table - 0 MB for the External Table and 0MB for the Materialized Table - 2.14 MB for the External Table and 0MB for the Materialized Table ## Question 3: How many records have a fare_amount of 0? - 12,488 - 128,219 - 112 - 1,622 ## Question 4: What is the best strategy to make an optimized table in Big Query if your query will always order the results by PUlocationID and filter based on lpep_pickup_datetime? (Create a new table with this strategy) - Cluster on lpep_pickup_datetime Partition by PUlocationID - Partition by lpep_pickup_datetime Cluster on PUlocationID - Partition by lpep_pickup_datetime and Partition by PUlocationID - Cluster on lpep_pickup_datetime and Cluster on PUlocationID ## Question 5: Write a query to retrieve the distinct PULocationID between lpep_pickup_datetime 06/01/2022 and 06/30/2022 (inclusive). Use the materialized table you created earlier in your from clause and note the estimated bytes. Now change the table in the from clause to the partitioned table you created for question 4 and note the estimated bytes processed. What are these values? Choose the answer which most closely matches. 
- 22.82 MB for non-partitioned table and 647.87 MB for the partitioned table - 12.82 MB for non-partitioned table and 1.12 MB for the partitioned table - 5.63 MB for non-partitioned table and 0 MB for the partitioned table - 10.31 MB for non-partitioned table and 10.31 MB for the partitioned table ## Question 6: Where is the data stored in the External Table you created? - Big Query - GCP Bucket - Big Table - Container Registry ## Question 7: It is best practice in Big Query to always cluster your data: - True - False ## (Bonus: Not worth points) Question 8: No Points: Write a `SELECT count(*)` query FROM the materialized table you created. How many bytes does it estimate will be read? Why? ## Submitting the solutions * Form for submitting: https://courses.datatalks.club/de-zoomcamp-2024/homework/hw3 ================================================ FILE: cohorts/2024/04-analytics-engineering/homework.md ================================================ ## Module 4 Homework In this homework, we'll use the models developed during the week 4 videos and enhance the already presented dbt project using the already loaded Taxi data for fhv vehicles for year 2019 in our DWH. This means that in this homework we use the following data [Datasets list](https://github.com/DataTalksClub/nyc-tlc-data/) * Yellow taxi data - Years 2019 and 2020 * Green taxi data - Years 2019 and 2020 * fhv data - Year 2019. We will use the data loaded for: * Building a source table: `stg_fhv_tripdata` * Building a fact table: `fact_fhv_trips` * Create a dashboard If you don't have access to GCP, you can do this locally using the ingested data from your Postgres database instead. If you have access to GCP, you don't need to do it for local Postgres - only if you want to. 
> **Note**: if your answer doesn't match exactly, select the closest option ### Question 1: **What happens when we execute `dbt build --vars '{'is_test_run':'true'}'`?** You'll need to have completed the ["Build the first dbt models"](https://www.youtube.com/watch?v=UVI30Vxzd6c) video. - It's the same as running *dbt build* - It applies a _limit 100_ to all of our models - It applies a _limit 100_ only to our staging models - Nothing ### Question 2: **What is the code that our CI job will run? Where is this code coming from?** - The code that has been merged into the main branch - The code that is behind the creation object on the dbt_cloud_pr_ schema - The code from any development branch that has been opened based on main - The code from the development branch we are requesting to merge to main ### Question 3 (2 points) **What is the count of records in the model fact_fhv_trips after running all dependencies with the test run variable disabled (:false)?** Create a staging model for the fhv data, similar to the ones made for yellow and green data. Add an additional filter for keeping only records with pickup time in year 2019. Do not add a deduplication step. Run this model without limits (is_test_run: false). Create a core model similar to fact trips, but selecting from stg_fhv_tripdata and joining with dim_zones. Similar to what we've done in fact_trips, keep only records with known pickup and dropoff location entries. Run the dbt model without limits (is_test_run: false). - 12998722 - 22998722 - 32998722 - 42998722 ### Question 4 (2 points) **What is the service that had the most rides during the month of July 2019, after building a tile for the fact_fhv_trips table and the fact_trips tile as seen in the videos?** Create a dashboard with some tiles that you find interesting to explore the data. 
One tile should show the amount of trips per month, as done in the videos for fact_trips, including the fact_fhv_trips data. - FHV - Green - Yellow - FHV and Green ## Submitting the solutions * Form for submitting: https://courses.datatalks.club/de-zoomcamp-2024/homework/hw4 Deadline: 22 February (Thursday), 22:00 CET ## Solution (To be published after deadline) * Video: https://youtu.be/3OPggh5Rca8 * Answers: * Question 1: It applies a _limit 100_ only to our staging models * Question 2: The code from the development branch we are requesting to merge to main * Question 3: 22998722 * Question 4: Yellow ================================================ FILE: cohorts/2024/05-batch/homework.md ================================================ ## Module 5 Homework Solution: https://www.youtube.com/watch?v=YtddC7vJOgQ In this homework we'll put what we learned about Spark in practice. For this homework we will be using the FHV 2019-10 data found here. [FHV Data](https://github.com/DataTalksClub/nyc-tlc-data/releases/download/fhv/fhv_tripdata_2019-10.csv.gz) ### Question 1: **Install Spark and PySpark** - Install Spark - Run PySpark - Create a local spark session - Execute spark.version. What's the output? > [!NOTE] > To install PySpark follow this [guide](https://github.com/DataTalksClub/data-engineering-zoomcamp/blob/main/05-batch/setup/pyspark.md) ### Question 2: **FHV October 2019** Read the October 2019 FHV into a Spark Dataframe with a schema as we did in the lessons. Repartition the Dataframe to 6 partitions and save it to parquet. What is the average size of the Parquet (ending with .parquet extension) Files that were created (in MB)? Select the answer which most closely matches. - 1MB - 6MB - 25MB - 87MB ### Question 3: **Count records** How many taxi trips were there on the 15th of October? Consider only trips that started on the 15th of October. 
- 108,164 - 12,856 - 452,470 - 62,610 > [!IMPORTANT] > Be aware of columns order when defining schema ### Question 4: **Longest trip for each day** What is the length of the longest trip in the dataset in hours? - 631,152.50 Hours - 243.44 Hours - 7.68 Hours - 3.32 Hours ### Question 5: **User Interface** Spark’s User Interface which shows the application's dashboard runs on which local port? - 80 - 443 - 4040 - 8080 ### Question 6: **Least frequent pickup location zone** Load the zone lookup data into a temp view in Spark [Zone Data](https://github.com/DataTalksClub/nyc-tlc-data/releases/download/misc/taxi_zone_lookup.csv) Using the zone lookup data and the FHV October 2019 data, what is the name of the LEAST frequent pickup location Zone? - East Chelsea - Jamaica Bay - Union Sq - Crown Heights North ## Submitting the solutions - Form for submitting: https://courses.datatalks.club/de-zoomcamp-2024/homework/hw5 - Deadline: See the website ================================================ FILE: cohorts/2024/06-streaming/docker-compose.yml ================================================ version: '3.7' services: # Redpanda cluster redpanda-1: image: docker.redpanda.com/vectorized/redpanda:v22.3.5 container_name: redpanda-1 command: - redpanda - start - --smp - '1' - --reserve-memory - 0M - --overprovisioned - --node-id - '1' - --kafka-addr - PLAINTEXT://0.0.0.0:29092,OUTSIDE://0.0.0.0:9092 - --advertise-kafka-addr - PLAINTEXT://redpanda-1:29092,OUTSIDE://localhost:9092 - --pandaproxy-addr - PLAINTEXT://0.0.0.0:28082,OUTSIDE://0.0.0.0:8082 - --advertise-pandaproxy-addr - PLAINTEXT://redpanda-1:28082,OUTSIDE://localhost:8082 - --rpc-addr - 0.0.0.0:33145 - --advertise-rpc-addr - redpanda-1:33145 ports: # - 8081:8081 - 8082:8082 - 9092:9092 - 28082:28082 - 29092:29092 ================================================ FILE: cohorts/2024/06-streaming/homework.md ================================================ ## Module 6 Homework In this homework, we're going to extend 
Module 5 Homework and learn about streaming with PySpark. Instead of Kafka, we will use Redpanda, which is a drop-in replacement for Kafka. Ensure you have the following set up (you will already have it if you did the previous homework and the module): - Docker (see [module 1](https://github.com/DataTalksClub/data-engineering-zoomcamp/tree/main/01-docker-terraform)) - PySpark (see [module 5](https://github.com/DataTalksClub/data-engineering-zoomcamp/tree/main/05-batch/setup)) For this homework we will be using the files from Module 5 homework: - Green 2019-10 data from [here](https://github.com/DataTalksClub/nyc-tlc-data/releases/download/green/green_tripdata_2019-10.csv.gz) ## Start Redpanda Let's start Redpanda in a Docker container. There's a `docker-compose.yml` file in the homework folder (taken from [here](https://github.com/redpanda-data-blog/2023-python-gsg/blob/main/docker-compose.yml)) Copy this file to your homework directory and run ```bash docker-compose up ``` (Add `-d` if you want to run in detached mode) ## Question 1: Redpanda version Now let's find out the version of Redpanda. For that, check the output of the command `rpk help` _inside the container_. The name of the container is `redpanda-1`. Find out what you need to execute based on the `help` output. What's the version, based on the output of the command you executed? (copy the entire version) ## Question 2. Creating a topic Before we can send data to the Redpanda server, we need to create a topic. We also do this with the `rpk` command we used previously for figuring out the version of Redpanda. Read the output of `help` and based on it, create a topic with name `test-topic` What's the output of the command for creating a topic? Include the entire output in your answer. ## Question 3. 
Connecting to the Kafka server We need to make sure we can connect to the server, so later we can send some data to its topics First, let's install the kafka connector (up to you if you want to have a separate virtual environment for that) ```bash pip install kafka-python ``` You can start a jupyter notebook in your solution folder or create a script Let's try to connect to our server: ```python import json import time from kafka import KafkaProducer def json_serializer(data): return json.dumps(data).encode('utf-8') server = 'localhost:9092' producer = KafkaProducer( bootstrap_servers=[server], value_serializer=json_serializer ) producer.bootstrap_connected() ``` Provided that you can connect to the server, what's the output of the last command? ## Question 4. Sending data to the stream Now we're ready to send some test data: ```python t0 = time.time() topic_name = 'test-topic' for i in range(10): message = {'number': i} producer.send(topic_name, value=message) print(f"Sent: {message}") time.sleep(0.05) producer.flush() t1 = time.time() print(f'took {(t1 - t0):.2f} seconds') ``` How much time did it take? Where did it spend most of the time? 
* Sending the messages * Flushing * Both took approximately the same amount of time (Don't remove `time.sleep` when answering this question) ## Reading data with `rpk` You can see the messages that you send to the topic with `rpk`: ```bash rpk topic consume test-topic ``` Run the command above and send the messages one more time to see them ## Sending the taxi data Now let's send our actual data: * Read the green csv.gz file * We will only need these columns: * `'lpep_pickup_datetime',` * `'lpep_dropoff_datetime',` * `'PULocationID',` * `'DOLocationID',` * `'passenger_count',` * `'trip_distance',` * `'tip_amount'` Iterate over the records in the dataframe ```python for row in df_green.itertuples(index=False): row_dict = {col: getattr(row, col) for col in row._fields} print(row_dict) break # TODO implement sending the data here ``` Note: this way of iterating over the records is more efficient compared to `iterrows` ## Question 5: Sending the Trip Data * Create a topic `green-trips` and send the data there * How much time in seconds did it take? (You can round it to a whole number) * Make sure you don't include sleeps in your code ## Creating the PySpark consumer Now let's read the data with PySpark. 
Spark needs a library (jar) to be able to connect to Kafka, so we need to tell PySpark that it needs to use it: ```python import pyspark from pyspark.sql import SparkSession pyspark_version = pyspark.__version__ kafka_jar_package = f"org.apache.spark:spark-sql-kafka-0-10_2.12:{pyspark_version}" spark = SparkSession \ .builder \ .master("local[*]") \ .appName("GreenTripsConsumer") \ .config("spark.jars.packages", kafka_jar_package) \ .getOrCreate() ``` Now we can connect to the stream: ```python green_stream = spark \ .readStream \ .format("kafka") \ .option("kafka.bootstrap.servers", "localhost:9092") \ .option("subscribe", "green-trips") \ .option("startingOffsets", "earliest") \ .load() ``` In order to test that we can consume from the stream, let's see what will be the first record there. In Spark streaming, the stream is represented as a sequence of small batches, each batch being a small RDD (or a small dataframe). So we can execute a function over each mini-batch. Let's run `take(1)` there to see what do we have in the stream: ```python def peek(mini_batch, batch_id): first_row = mini_batch.take(1) if first_row: print(first_row[0]) query = green_stream.writeStream.foreachBatch(peek).start() ``` You should see a record like this: ``` Row(key=None, value=bytearray(b'{"lpep_pickup_datetime": "2019-10-01 00:26:02", "lpep_dropoff_datetime": "2019-10-01 00:39:58", "PULocationID": 112, "DOLocationID": 196, "passenger_count": 1.0, "trip_distance": 5.88, "tip_amount": 0.0}'), topic='green-trips', partition=0, offset=0, timestamp=datetime.datetime(2024, 3, 12, 22, 42, 9, 411000), timestampType=0) ``` Now let's stop the query, so it doesn't keep consuming messages from the stream ```python query.stop() ``` ## Question 6. Parsing the data The data is JSON, but currently it's in binary format. We need to parse it and turn it into a streaming dataframe with proper columns. 
Similarly to PySpark, we define the schema ```python from pyspark.sql import types schema = types.StructType() \ .add("lpep_pickup_datetime", types.StringType()) \ .add("lpep_dropoff_datetime", types.StringType()) \ .add("PULocationID", types.IntegerType()) \ .add("DOLocationID", types.IntegerType()) \ .add("passenger_count", types.DoubleType()) \ .add("trip_distance", types.DoubleType()) \ .add("tip_amount", types.DoubleType()) ``` And apply this schema: ```python from pyspark.sql import functions as F green_stream = green_stream \ .select(F.from_json(F.col("value").cast('STRING'), schema).alias("data")) \ .select("data.*") ``` How does the record look after parsing? Copy the output. ### Question 7: Most popular destination Now let's finally do some streaming analytics. We will see what's the most popular destination currently based on our stream of data (which ideally we should have sent with delays like we did in workshop 2) This is how you can do it: * Add a column "timestamp" using the `current_timestamp` function * Group by: * 5 minutes window based on the timestamp column (`F.window(col("timestamp"), "5 minutes")`) * `"DOLocationID"` * Order by count You can print the output to the console using this code ```python query = popular_destinations \ .writeStream \ .outputMode("complete") \ .format("console") \ .option("truncate", "false") \ .start() query.awaitTermination() ``` Write the most popular destination, your answer should be *either* the zone ID or the zone name of this destination. (You will need to re-send the data for this to work) ## Submitting the solutions * Form for submitting: https://courses.datatalks.club/de-zoomcamp-2024/homework/hw6 ## Solution We will publish the solution here after deadline. 
================================================ FILE: cohorts/2024/README.md ================================================ ## Data Engineering Zoomcamp 2024 Cohort * [Pre-launch Q&A stream](https://www.youtube.com/watch?v=91b8u9GmqB4) * [Launch stream with course overview](https://www.youtube.com/live/AtRhA-NfS24?si=5JzA_E8BmJjiLi8l) * [Deadline calendar](https://docs.google.com/spreadsheets/d/e/2PACX-1vQACMLuutV5rvXg5qICuJGL-yZqIV0FBD84CxPdC5eZHf8TfzB-CJT_3Mo7U7oGVTXmSihPgQxuuoku/pubhtml) * [FAQ](https://datatalks.club/faq/data-engineering-zoomcamp.html) * Course Playlist: Only 2024 Live videos & homeworks (TODO) * [Public Leaderboard of Top-100 Participants](leaderboard.md) [**Module 1: Introduction & Prerequisites**](01-docker-terraform/) * [Homework](01-docker-terraform/homework.md) [**Module 2: Workflow Orchestration**](02-workflow-orchestration) * [Homework](02-workflow-orchestration/homework.md) * Office hours [**Workshop 1: Data Ingestion**](workshops/dlt.md) * Workshop with dlt * [Homework](workshops/dlt.md) [**Module 3: Data Warehouse**](03-data-warehouse) * [Homework](03-data-warehouse/homework.md) [**Module 4: Analytics Engineering**](04-analytics-engineering/) * [Homework](04-analytics-engineering/homework.md) [**Module 5: Batch processing**](05-batch/) * [Homework](05-batch/homework.md) [**Module 6: Stream Processing**](06-streaming) * [Homework](06-streaming/homework.md) [**Project**](project.md) More information [here](project.md) ================================================ FILE: cohorts/2024/leaderboard.md ================================================ ## Leaderboard This is the top [100 leaderboard](https://courses.datatalks.club/de-zoomcamp-2024/leaderboard) of participants of Data Engineering Zoomcamp 2024 edition!| \n", " | id | \n", "name | \n", "age | \n", "city | \n", "_dlt_load_id | \n", "_dlt_id | \n", "occupation | \n", "
|---|---|---|---|---|---|---|---|
| 0 | \n", "1 | \n", "Person_1 | \n", "26 | \n", "City_A | \n", "1706029306.7456656 | \n", "An8WyXL43/J1GQ | \n", "None | \n", "
| 1 | \n", "2 | \n", "Person_2 | \n", "27 | \n", "City_A | \n", "1706029306.7456656 | \n", "ZGI1S72CddPbJQ | \n", "None | \n", "
| 2 | \n", "3 | \n", "Person_3 | \n", "28 | \n", "City_A | \n", "1706029306.7456656 | \n", "+z4Pm5oCykL2Vg | \n", "None | \n", "
| 3 | \n", "4 | \n", "Person_4 | \n", "29 | \n", "City_A | \n", "1706029306.7456656 | \n", "0Vfr36JHZ34OJA | \n", "None | \n", "
| 4 | \n", "5 | \n", "Person_5 | \n", "30 | \n", "City_A | \n", "1706029306.7456656 | \n", "aA+9WOclw3YWpg | \n", "None | \n", "
| 5 | \n", "3 | \n", "Person_3 | \n", "33 | \n", "City_B | \n", "1706029307.9851513 | \n", "mEegoM7n4XujYw | \n", "Job_3 | \n", "
| 6 | \n", "4 | \n", "Person_4 | \n", "34 | \n", "City_B | \n", "1706029307.9851513 | \n", "FPrsrzXgz+E9Fw | \n", "Job_4 | \n", "
| 7 | \n", "5 | \n", "Person_5 | \n", "35 | \n", "City_B | \n", "1706029307.9851513 | \n", "ZaAOBa5EEqXU1Q | \n", "Job_5 | \n", "
| 8 | \n", "6 | \n", "Person_6 | \n", "36 | \n", "City_B | \n", "1706029307.9851513 | \n", "gmcktDnX6y4Fmg | \n", "Job_6 | \n", "
| 9 | \n", "7 | \n", "Person_7 | \n", "37 | \n", "City_B | \n", "1706029307.9851513 | \n", "960gdVKySsa4JA | \n", "Job_7 | \n", "
| 10 | \n", "8 | \n", "Person_8 | \n", "38 | \n", "City_B | \n", "1706029307.9851513 | \n", "+su5IfZQyFEsEw | \n", "Job_8 | \n", "
| \n", " | id | \n", "name | \n", "age | \n", "city | \n", "occupation | \n", "_dlt_load_id | \n", "_dlt_id | \n", "
|---|---|---|---|---|---|---|---|
| 0 | \n", "8 | \n", "Person_8 | \n", "38 | \n", "City_B | \n", "Job_8 | \n", "1706030294.7037766 | \n", "Q1k+DIAjXLL7cg | \n", "
| 1 | \n", "4 | \n", "Person_4 | \n", "34 | \n", "City_B | \n", "Job_4 | \n", "1706030294.7037766 | \n", "ewlZ3LjULEchiQ | \n", "
| 2 | \n", "5 | \n", "Person_5 | \n", "35 | \n", "City_B | \n", "Job_5 | \n", "1706030294.7037766 | \n", "X+LfQEa/X8GU9w | \n", "
| 3 | \n", "7 | \n", "Person_7 | \n", "37 | \n", "City_B | \n", "Job_7 | \n", "1706030294.7037766 | \n", "lQT0h7IL7E/wxg | \n", "
| 4 | \n", "3 | \n", "Person_3 | \n", "33 | \n", "City_B | \n", "Job_3 | \n", "1706030294.7037766 | \n", "gRBswCo8B/DJmw | \n", "
| 5 | \n", "6 | \n", "Person_6 | \n", "36 | \n", "City_B | \n", "Job_6 | \n", "1706030294.7037766 | \n", "M3IbNKfZZCtbcQ | \n", "
Documentation 📑 Hands-on Tutorials 🎯 RisingWave Cloud 🚀 Get Instant Help
## Stream processing with RisingWave In this hands-on workshop, we’ll learn how to process real-time streaming data using SQL in RisingWave. The system we’ll use is [RisingWave](https://github.com/risingwavelabs/risingwave), an open-source SQL database for processing and managing streaming data. RisingWave’s user experience will likely feel familiar, as it’s fully wire-compatible with PostgreSQL.  We’ll cover the following topics in this Workshop: - Why Stream Processing? - Stateless computation (Filters, Projections) - Stateful Computation (Aggregations, Joins) - Data Ingestion and Delivery RisingWave in 10 Minutes: https://tutorials.risingwave.com/docs/intro Workshop video: For this homework we will be using the Yellow Taxi Trip Records for **January 2024 - June 2024 NOT the entire year of data**
Parquet Files from the New York
City Taxi Data found here: https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page
If you are using orchestration such as Kestra, Mage, Airflow or Prefect etc. do not load the data into Big Query using the orchestrator.
Stop after loading the files into a bucket.
**Load Script:** You can manually download the parquet files and upload them to your GCS Bucket or you can use the linked script [here](./load_yellow_taxi_data.py):
You will simply need to generate a Service Account with GCS Admin Privileges or be authenticated with the Google SDK, and update the bucket name in the script to the name of your bucket.
Nothing is foolproof, so make sure that all 6 files show in your GCS Bucket before beginning.
NOTE: You will need to use the PARQUET format option for the files when creating an External Table
BIG QUERY SETUP:
Create an external table using the Yellow Taxi Trip Records.
Create a (regular/materialized) table in BQ using the Yellow Taxi Trip Records (do not partition or cluster this table).
| \n", " | VendorID | \n", "lpep_pickup_datetime | \n", "lpep_dropoff_datetime | \n", "store_and_fwd_flag | \n", "RatecodeID | \n", "PULocationID | \n", "DOLocationID | \n", "passenger_count | \n", "trip_distance | \n", "fare_amount | \n", "extra | \n", "mta_tax | \n", "tip_amount | \n", "tolls_amount | \n", "ehail_fee | \n", "improvement_surcharge | \n", "total_amount | \n", "payment_type | \n", "trip_type | \n", "congestion_surcharge | \n", "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", "2.0 | \n", "2019-10-01 00:26:02 | \n", "2019-10-01 00:39:58 | \n", "N | \n", "1.0 | \n", "112 | \n", "196 | \n", "1.0 | \n", "5.88 | \n", "18.0 | \n", "0.50 | \n", "0.5 | \n", "0.00 | \n", "0.0 | \n", "NaN | \n", "0.3 | \n", "19.30 | \n", "2.0 | \n", "1.0 | \n", "0.0 | \n", "
| 1 | \n", "1.0 | \n", "2019-10-01 00:18:11 | \n", "2019-10-01 00:22:38 | \n", "N | \n", "1.0 | \n", "43 | \n", "263 | \n", "1.0 | \n", "0.80 | \n", "5.0 | \n", "3.25 | \n", "0.5 | \n", "0.00 | \n", "0.0 | \n", "NaN | \n", "0.3 | \n", "9.05 | \n", "2.0 | \n", "1.0 | \n", "0.0 | \n", "
| 2 | \n", "1.0 | \n", "2019-10-01 00:09:31 | \n", "2019-10-01 00:24:47 | \n", "N | \n", "1.0 | \n", "255 | \n", "228 | \n", "2.0 | \n", "7.50 | \n", "21.5 | \n", "0.50 | \n", "0.5 | \n", "0.00 | \n", "0.0 | \n", "NaN | \n", "0.3 | \n", "22.80 | \n", "2.0 | \n", "1.0 | \n", "0.0 | \n", "
| 3 | \n", "1.0 | \n", "2019-10-01 00:37:40 | \n", "2019-10-01 00:41:49 | \n", "N | \n", "1.0 | \n", "181 | \n", "181 | \n", "1.0 | \n", "0.90 | \n", "5.5 | \n", "0.50 | \n", "0.5 | \n", "0.00 | \n", "0.0 | \n", "NaN | \n", "0.3 | \n", "6.80 | \n", "2.0 | \n", "1.0 | \n", "0.0 | \n", "
| 4 | \n", "2.0 | \n", "2019-10-01 00:08:13 | \n", "2019-10-01 00:17:56 | \n", "N | \n", "1.0 | \n", "97 | \n", "188 | \n", "1.0 | \n", "2.52 | \n", "10.0 | \n", "0.50 | \n", "0.5 | \n", "2.26 | \n", "0.0 | \n", "NaN | \n", "0.3 | \n", "13.56 | \n", "1.0 | \n", "1.0 | \n", "0.0 | \n", "
In this notebook we will use:
\n", "\n", "\n", " DuckDB is great for beginners because it requires no setup and no credentials.\n", "
" ] }, { "cell_type": "markdown", "metadata": { "id": "aQTSvnvnHWBd" }, "source": [ "## 📚 Step 1: Import Libraries" ] }, { "cell_type": "markdown", "metadata": { "id": "YFQGLTECWkpn" }, "source": [ "\n", "In this cell we import the libraries we will use throughout the notebook:
\n", "\n", "itertools) is a small Python helper for previewing only a few records\n", " In dlt, a source is the part of your pipeline that knows how to fetch data from somewhere.\n", " In this notebook, our source fetches data from the Open Library Search API.\n", "
\n", "\n", "\n",
" We define the source using rest_api_source, which lets us describe an API in a simple\n",
" Python dictionary instead of writing lots of request code.\n",
"
\n",
" 📖 Open Library Search API docs:
\n",
" \n",
" https://openlibrary.org/dev/docs/api/search\n",
" \n",
"
pipeline.run() do?\n",
" pipeline.run() simply combines the three steps we already executed manually:\n",
"
In other words, this:
\n", "\n", "pipeline.run(source)\n",
"\n",
"is equivalent to:
\n", "\n", "pipeline.extract(source)\n",
"pipeline.normalize()\n",
"pipeline.load()\n",
"\n",
"\n", " There is no hidden magic. It just runs the full ELT process in order.\n", "
\n" ] }, { "cell_type": "markdown", "metadata": { "id": "7ViMq6gIfJj_" }, "source": [ "## 🔎 Step 8: Inspect the Loaded Data\n", "\n", "Now that the data is loaded into DuckDB, we can inspect it using `pipeline.dataset()`.\n", "\n", "This gives us a convenient Python interface for exploring the tables that dlt created, without writing SQL.\n", "\n", "---\n", "\n", "### List available tables\n", "\n", "First, let’s see what tables exist in the dataset:" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "id": "bmnrK1aVZXPO" }, "outputs": [], "source": [ "ds = pipeline.dataset()" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "SV6J6AtBf0xq", "outputId": "19ad26bf-f34a-4f8e-c30c-5acd3342c3c5" }, "outputs": [ { "data": { "text/plain": [ "['books',\n", " 'books__author_key',\n", " 'books__author_name',\n", " 'books__ia',\n", " 'books__ia_collection',\n", " 'books__language',\n", " 'books__id_standard_ebooks',\n", " 'books__id_librivox',\n", " 'books__id_project_gutenberg',\n", " '_dlt_version',\n", " '_dlt_loads',\n", " '_dlt_pipeline_state']" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ds.tables" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 315 }, "id": "WLa4yN7lf1TF", "outputId": "d2da841b-a8bf-461f-a011-eb1db644656f" }, "outputs": [ { "data": { "application/vnd.google.colaboratory.intrinsic+json": { "summary": "{\n \"name\": \"df\",\n \"rows\": 3756,\n \"fields\": [\n {\n \"column\": \"cover_edition_key\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1192,\n \"samples\": [\n \"OL24951484M\",\n \"OL9131663M\",\n \"OL47198575M\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"cover_i\",\n \"properties\": {\n \"dtype\": \"Int64\",\n \"num_unique_values\": 1288,\n \"samples\": [\n 842156,\n 
10365881,\n 3341732\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"ebook_access\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"printdisabled\",\n \"unclassified\",\n \"no_ebook\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"edition_count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 108,\n \"min\": 0,\n \"max\": 3546,\n \"num_unique_values\": 62,\n \"samples\": [\n 44,\n 92,\n 396\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"first_publish_year\",\n \"properties\": {\n \"dtype\": \"Int64\",\n \"num_unique_values\": 127,\n \"samples\": [\n 2008,\n 1622,\n 1962\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"has_fulltext\",\n \"properties\": {\n \"dtype\": \"boolean\",\n \"num_unique_values\": 2,\n \"samples\": [\n false,\n true\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"key\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 3756,\n \"samples\": [\n \"/works/OL34662215W\",\n \"/works/OL39702699W\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"lending_edition_s\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 281,\n \"samples\": [\n \"OL45637056M\",\n \"OL26064272M\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"lending_identifier_s\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 281,\n \"samples\": [\n \"alicesadventures0000unse_v7d2\",\n \"harrypottermagic0000unse_n5w6\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"public_scan_b\",\n \"properties\": {\n \"dtype\": \"boolean\",\n \"num_unique_values\": 2,\n \"samples\": [\n true,\n false\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": 
\"title\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2984,\n \"samples\": [\n \"1000 Facts and Trivia about Marvel Cinematic Universe, Game of Thrones, Disney, Star Wars, Harry Potter 1\",\n \"The Unofficial Harry Potter Insults Handbook\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"_dlt_load_id\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"1770819876.9353185\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"_dlt_id\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 3756,\n \"samples\": [\n \"ZN3UfCkWBXFxSw\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"subtitle\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 59,\n \"samples\": [\n \"Hogwarts Through the Years\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}", "type": "dataframe", "variable_name": "df" }, "text/html": [ "\n", "| \n", " | cover_edition_key | \n", "cover_i | \n", "ebook_access | \n", "edition_count | \n", "first_publish_year | \n", "has_fulltext | \n", "key | \n", "lending_edition_s | \n", "lending_identifier_s | \n", "public_scan_b | \n", "title | \n", "_dlt_load_id | \n", "_dlt_id | \n", "subtitle | \n", "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", "OL61027601M | \n", "15155833 | \n", "borrowable | \n", "396 | \n", "1997 | \n", "True | \n", "/works/OL82563W | \n", "OL38565767M | \n", "harrypotterylapi0000rowl_q5r6 | \n", "False | \n", "Harry Potter and the Philosopher's Stone | \n", "1770819876.9353185 | \n", "lGJrV2BS8Z9qJQ | \n", "None | \n", "
| 1 | \n", "OL26378158M | \n", "15158660 | \n", "printdisabled | \n", "144 | \n", "2007 | \n", "True | \n", "/works/OL82586W | \n", "None | \n", "None | \n", "False | \n", "Harry Potter and the Deathly Hallows | \n", "1770819876.9353185 | \n", "F9W0WQlLwgvsFw | \n", "None | \n", "
| 2 | \n", "OL26234270M | \n", "10580435 | \n", "borrowable | \n", "278 | \n", "1999 | \n", "True | \n", "/works/OL82536W | \n", "OL48101764M | \n", "bdrc-W8LS66814 | \n", "False | \n", "Harry Potter and the Prisoner of Azkaban | \n", "1770819876.9353185 | \n", "kSdfO1XbBVAjmQ | \n", "None | \n", "
You can watch this video to see what your learning-in-public posts may look like:
## Daily Documentation
- **Post Daily Diaries**: Document what you learn each day, including the challenges faced and the methods used to overcome them.
- **Create Quick Videos**: Make short videos showcasing your work and upload them to GitHub.
Send a PR if you want to suggest improvements for this document
================================================
FILE: projects/README.md
================================================
## Course Project
[🎥 Projects how-to (watch it!)](https://www.youtube.com/watch?v=BL0E8xO8OnE)
### Objective
The goal of this project is to apply everything we have learned
in this course to build an end-to-end data pipeline.
### Problem statement
Develop a dashboard with two tiles by:
* Selecting a dataset of interest (see [Datasets](#datasets))
* Creating a pipeline for processing this dataset and putting it into a data lake
* Creating a pipeline for moving the data from the lake to a data warehouse
* Transforming the data in the data warehouse: prepare it for the dashboard
* Building a dashboard to visualize the data
## Data Pipeline
The pipeline could be **stream** or **batch**: this is the first thing you'll need to decide
* **Stream**: If you want to consume data in real time and put it into the data lake
* **Batch**: If you want to run things periodically (e.g. hourly/daily)
## Technologies
You don't have to limit yourself to technologies covered in the course. You can use alternatives as well:
* **Cloud**: AWS, GCP, Azure, ...
* **Infrastructure as code (IaC)**: Terraform, Pulumi, CloudFormation, ...
* **Workflow orchestration**: Airflow, Prefect, Luigi, ...
* **Data Warehouse**: BigQuery, Snowflake, Redshift, ...
* **Batch processing**: Spark, Flink, AWS Batch, ...
* **Stream processing**: Kafka, Pulsar, Kinesis, ...
If you use a tool that wasn't covered in the course, be sure to explain what that tool does.
If you're not certain about some tools, ask in Slack.
## Dashboard
You can use any of the tools shown in the course (Looker Studio or Streamlit) or any other BI tool of your choice to build a dashboard. If you do use another tool, please specify and make sure that the dashboard is somehow accessible to your peers.
Your dashboard should contain at least two tiles. We suggest you include:
- 1 graph that shows the distribution of some categorical data
- 1 graph that shows the distribution of the data across a temporal line
Ensure that your graph is easy to understand by adding references and titles.
Example dashboard: 
## Peer reviewing
> [!IMPORTANT]
> To evaluate the projects, we'll use peer reviewing. This is a great opportunity for you to learn from each other.
> * To get points for your project, you need to evaluate 3 projects of your peers
> * You get 3 extra points for each evaluation
## Evaluation Criteria
* Problem description
* 0 points: Problem is not described
* 2 points: Problem is described, but only briefly or not clearly
* 4 points: Problem is well described and it's clear what problem the project solves
* Cloud
* 0 points: Cloud is not used, things run only locally
* 2 points: The project is developed in the cloud
* 4 points: The project is developed in the cloud and IaC tools are used
* Data ingestion (choose either batch or stream)
* Batch / Workflow orchestration
* 0 points: No workflow orchestration
* 2 points: Partial workflow orchestration: some steps are orchestrated, some run manually
* 4 points: End-to-end pipeline: multiple steps in the DAG, uploading data to data lake
* Stream
* 0 points: No streaming system (like Kafka, Pulsar, etc)
* 2 points: A simple pipeline with one consumer and one producer
* 4 points: Using consumer/producers and streaming technologies (like Kafka streaming, Spark streaming, Flink, etc)
* Data warehouse
* 0 points: No DWH is used
* 2 points: Tables are created in DWH, but not optimized
* 4 points: Tables are partitioned and clustered in a way that makes sense for the upstream queries (with explanation)
* Transformations (dbt, spark, etc)
* 0 points: No transformations
* 2 points: Simple SQL transformation (no dbt or similar tools)
* 4 points: Transformations are defined with dbt, Spark or similar technologies
* Dashboard
* 0 points: No dashboard
* 2 points: A dashboard with 1 tile
* 4 points: A dashboard with 2 tiles
* Reproducibility
* 0 points: No instructions on how to run the code at all
* 2 points: Some instructions are there, but they are not complete
* 4 points: Instructions are clear, it's easy to run the code, and the code works
> [!NOTE]
> It's highly recommended to create a new repository for your project (not inside an existing repo) with a meaningful title, such as
> "Quake Analytics Dashboard" or "Bike Data Insights" and include as many details as possible in the README file. ChatGPT can assist you with this. Doing so will not only make it easier to showcase your project for potential job opportunities but also have it featured on the [Projects Gallery App](#projects-gallery).
> If you leave the README file empty or with minimal details, there may be point deductions as per the [Evaluation Criteria](#evaluation-criteria).
## Going the extra mile (Optional)
> [!NOTE]
> The following things are not covered in the course, are entirely optional and they will not be graded.
However, implementing these could significantly enhance the quality of your project:
* Add tests
* Use make
* Add CI/CD pipeline
If you intend to include this project in your portfolio, adding these additional features will definitely help you to stand out from others.
## Cheating and plagiarism
Plagiarism in any form is not allowed. Examples of plagiarism:
* Taking somebody else's notebooks and projects (in full or partly) and using them for the capstone project
* Re-using your own projects (in full or partly) from other courses and bootcamps
* Re-using your midterm project from ML Zoomcamp in capstone
* Re-using your ML Zoomcamp project from previous iterations of the course
Violating any of this will result in 0 points for this project.
## Resources
### Datasets
Refer to the provided [datasets](datasets.md) for possible selection.
### Helpful Links
* [Unit Tests + CI for Airflow](https://www.astronomer.io/events/recaps/testing-airflow-to-bulletproof-your-code/)
* [CI/CD for Airflow (with Gitlab & GCP state file)](https://engineering.ripple.com/building-ci-cd-with-airflow-gitlab-and-terraform-in-gcp)
* [CI/CD for Airflow (with GitHub and S3 state file)](https://programmaticponderings.com/2021/12/14/devops-for-dataops-building-a-ci-cd-pipeline-for-apache-airflow-dags/)
* [CD for Terraform](https://medium.com/towards-data-science/git-actions-terraform-for-data-engineers-scientists-gcp-aws-azure-448dc7c60fcc)
* [Spark + Airflow](https://medium.com/doubtnut/github-actions-airflow-for-automating-your-spark-pipeline-c9dff32686b)
### Projects Gallery
Explore a collection of projects completed by members of our community. The projects cover a wide range of topics and utilize different tools and techniques. Feel free to delve into any project and see how others have tackled real-world problems with data, structured their code, and presented their findings. It's a great resource to learn and get ideas for your own projects.
[](https://datatalksclub-projects.streamlit.app/)
### DE Zoomcamp 2023
* [2023 Projects](../cohorts/2023/project.md)
### DE Zoomcamp 2022
* [2022 Projects](../cohorts/2022/project.md)
================================================
FILE: projects/datasets.md
================================================
## Datasets
Here are some datasets that you could use for the project:
* [Kaggle](https://www.kaggle.com/datasets)
* [AWS datasets](https://registry.opendata.aws/)
* [UK government open data](https://data.gov.uk/)
* [Github archive](https://www.gharchive.org)
* [Awesome public datasets](https://github.com/awesomedata/awesome-public-datasets)
* [Million songs dataset](http://millionsongdataset.com)
* [Some random datasets](https://components.one/datasets/)
* [COVID Datasets](https://www.reddit.com/r/datasets/comments/n3ph2d/coronavirus_datsets/)
* [Datasets from Azure](https://docs.microsoft.com/en-us/azure/azure-sql/public-data-sets)
* [Datasets from BigQuery](https://cloud.google.com/bigquery/public-data/)
* [Dataset search engine from Google](https://datasetsearch.research.google.com/)
* [Public datasets offered by different GCP services](https://cloud.google.com/solutions/datasets)
* [European statistics datasets](https://ec.europa.eu/eurostat/data/database)
* [Datasets for streaming](https://github.com/ColinEberhardt/awesome-public-streaming-datasets)
* [Dataset for Santander bicycle rentals in London](https://cycling.data.tfl.gov.uk/)
* [Common crawl data](https://commoncrawl.org/) (copy of the internet)
* [NASA's EarthData](https://search.earthdata.nasa.gov/search) (May require introductory geospatial analysis)
* Collection Of Data Repositories
* [part 1](https://www.kdnuggets.com/2022/04/complete-collection-data-repositories-part-1.html) (from agriculture and finance to government)
* [part 2](https://www.kdnuggets.com/2022/04/complete-collection-data-repositories-part-2.html) (from healthcare to transportation)
* [Data For Good by Meta](https://dataforgood.facebook.com/dfg/tools)
PRs with more datasets are welcome!
It's not mandatory that you use a dataset from this list. You can use any dataset you want.
================================================
FILE: workshop-best-practices.md
================================================
# Workshop Best Practices
Preferences and patterns learned from building the PyFlink streaming workshop.
## Structure and Pacing
- Introduce services one at a time, not all at once. Start with one container
(e.g., Redpanda), explain it, use it. Then add the next (PostgreSQL), etc.
- Start with the simplest version that works (plain Python consumer), then
motivate the more complex tool (Flink) by showing what's missing.
- Use `docker compose up