Copy disabled (too large)
Download .txt
Showing preview only (16,197K chars total). Download the full file to get everything.
Repository: jakevdp/PythonDataScienceHandbook
Branch: master
Commit: d66231454ef7
Files: 187
Total size: 33.8 MB
Directory structure:
gitextract_10clvamz/
├── .gitignore
├── .gitmodules
├── LICENSE-CODE
├── LICENSE-TEXT
├── README.md
├── environment.yml
├── notebooks/
│ ├── 00.00-Preface.ipynb
│ ├── 01.00-IPython-Beyond-Normal-Python.ipynb
│ ├── 01.01-Help-And-Documentation.ipynb
│ ├── 01.02-Shell-Keyboard-Shortcuts.ipynb
│ ├── 01.03-Magic-Commands.ipynb
│ ├── 01.04-Input-Output-History.ipynb
│ ├── 01.05-IPython-And-Shell-Commands.ipynb
│ ├── 01.06-Errors-and-Debugging.ipynb
│ ├── 01.07-Timing-and-Profiling.ipynb
│ ├── 01.08-More-IPython-Resources.ipynb
│ ├── 02.00-Introduction-to-NumPy.ipynb
│ ├── 02.01-Understanding-Data-Types.ipynb
│ ├── 02.02-The-Basics-Of-NumPy-Arrays.ipynb
│ ├── 02.03-Computation-on-arrays-ufuncs.ipynb
│ ├── 02.04-Computation-on-arrays-aggregates.ipynb
│ ├── 02.05-Computation-on-arrays-broadcasting.ipynb
│ ├── 02.06-Boolean-Arrays-and-Masks.ipynb
│ ├── 02.07-Fancy-Indexing.ipynb
│ ├── 02.08-Sorting.ipynb
│ ├── 02.09-Structured-Data-NumPy.ipynb
│ ├── 03.00-Introduction-to-Pandas.ipynb
│ ├── 03.01-Introducing-Pandas-Objects.ipynb
│ ├── 03.02-Data-Indexing-and-Selection.ipynb
│ ├── 03.03-Operations-in-Pandas.ipynb
│ ├── 03.04-Missing-Values.ipynb
│ ├── 03.05-Hierarchical-Indexing.ipynb
│ ├── 03.06-Concat-And-Append.ipynb
│ ├── 03.07-Merge-and-Join.ipynb
│ ├── 03.08-Aggregation-and-Grouping.ipynb
│ ├── 03.09-Pivot-Tables.ipynb
│ ├── 03.10-Working-With-Strings.ipynb
│ ├── 03.11-Working-with-Time-Series.ipynb
│ ├── 03.12-Performance-Eval-and-Query.ipynb
│ ├── 03.13-Further-Resources.ipynb
│ ├── 04.00-Introduction-To-Matplotlib.ipynb
│ ├── 04.01-Simple-Line-Plots.ipynb
│ ├── 04.02-Simple-Scatter-Plots.ipynb
│ ├── 04.03-Errorbars.ipynb
│ ├── 04.04-Density-and-Contour-Plots.ipynb
│ ├── 04.05-Histograms-and-Binnings.ipynb
│ ├── 04.06-Customizing-Legends.ipynb
│ ├── 04.07-Customizing-Colorbars.ipynb
│ ├── 04.08-Multiple-Subplots.ipynb
│ ├── 04.09-Text-and-Annotation.ipynb
│ ├── 04.10-Customizing-Ticks.ipynb
│ ├── 04.11-Settings-and-Stylesheets.ipynb
│ ├── 04.12-Three-Dimensional-Plotting.ipynb
│ ├── 04.14-Visualization-With-Seaborn.ipynb
│ ├── 04.15-Further-Resources.ipynb
│ ├── 05.00-Machine-Learning.ipynb
│ ├── 05.01-What-Is-Machine-Learning.ipynb
│ ├── 05.02-Introducing-Scikit-Learn.ipynb
│ ├── 05.03-Hyperparameters-and-Model-Validation.ipynb
│ ├── 05.04-Feature-Engineering.ipynb
│ ├── 05.05-Naive-Bayes.ipynb
│ ├── 05.06-Linear-Regression.ipynb
│ ├── 05.07-Support-Vector-Machines.ipynb
│ ├── 05.08-Random-Forests.ipynb
│ ├── 05.09-Principal-Component-Analysis.ipynb
│ ├── 05.10-Manifold-Learning.ipynb
│ ├── 05.11-K-Means.ipynb
│ ├── 05.12-Gaussian-Mixtures.ipynb
│ ├── 05.13-Kernel-Density-Estimation.ipynb
│ ├── 05.14-Image-Features.ipynb
│ ├── 05.15-Learning-More.ipynb
│ ├── 06.00-Figure-Code.ipynb
│ ├── Untitled.ipynb
│ ├── data/
│ │ ├── births.csv
│ │ ├── president_heights.csv
│ │ ├── state-abbrevs.csv
│ │ ├── state-areas.csv
│ │ └── state-population.csv
│ └── helpers_05_08.py
├── notebooks_v1/
│ ├── 00.00-Preface.ipynb
│ ├── 01.00-IPython-Beyond-Normal-Python.ipynb
│ ├── 01.01-Help-And-Documentation.ipynb
│ ├── 01.02-Shell-Keyboard-Shortcuts.ipynb
│ ├── 01.03-Magic-Commands.ipynb
│ ├── 01.04-Input-Output-History.ipynb
│ ├── 01.05-IPython-And-Shell-Commands.ipynb
│ ├── 01.06-Errors-and-Debugging.ipynb
│ ├── 01.07-Timing-and-Profiling.ipynb
│ ├── 01.08-More-IPython-Resources.ipynb
│ ├── 02.00-Introduction-to-NumPy.ipynb
│ ├── 02.01-Understanding-Data-Types.ipynb
│ ├── 02.02-The-Basics-Of-NumPy-Arrays.ipynb
│ ├── 02.03-Computation-on-arrays-ufuncs.ipynb
│ ├── 02.04-Computation-on-arrays-aggregates.ipynb
│ ├── 02.05-Computation-on-arrays-broadcasting.ipynb
│ ├── 02.06-Boolean-Arrays-and-Masks.ipynb
│ ├── 02.07-Fancy-Indexing.ipynb
│ ├── 02.08-Sorting.ipynb
│ ├── 02.09-Structured-Data-NumPy.ipynb
│ ├── 03.00-Introduction-to-Pandas.ipynb
│ ├── 03.01-Introducing-Pandas-Objects.ipynb
│ ├── 03.02-Data-Indexing-and-Selection.ipynb
│ ├── 03.03-Operations-in-Pandas.ipynb
│ ├── 03.04-Missing-Values.ipynb
│ ├── 03.05-Hierarchical-Indexing.ipynb
│ ├── 03.06-Concat-And-Append.ipynb
│ ├── 03.07-Merge-and-Join.ipynb
│ ├── 03.08-Aggregation-and-Grouping.ipynb
│ ├── 03.09-Pivot-Tables.ipynb
│ ├── 03.10-Working-With-Strings.ipynb
│ ├── 03.11-Working-with-Time-Series.ipynb
│ ├── 03.12-Performance-Eval-and-Query.ipynb
│ ├── 03.13-Further-Resources.ipynb
│ ├── 04.00-Introduction-To-Matplotlib.ipynb
│ ├── 04.01-Simple-Line-Plots.ipynb
│ ├── 04.02-Simple-Scatter-Plots.ipynb
│ ├── 04.03-Errorbars.ipynb
│ ├── 04.04-Density-and-Contour-Plots.ipynb
│ ├── 04.05-Histograms-and-Binnings.ipynb
│ ├── 04.06-Customizing-Legends.ipynb
│ ├── 04.07-Customizing-Colorbars.ipynb
│ ├── 04.08-Multiple-Subplots.ipynb
│ ├── 04.09-Text-and-Annotation.ipynb
│ ├── 04.10-Customizing-Ticks.ipynb
│ ├── 04.11-Settings-and-Stylesheets.ipynb
│ ├── 04.12-Three-Dimensional-Plotting.ipynb
│ ├── 04.13-Geographic-Data-With-Basemap.ipynb
│ ├── 04.14-Visualization-With-Seaborn.ipynb
│ ├── 04.15-Further-Resources.ipynb
│ ├── 05.00-Machine-Learning.ipynb
│ ├── 05.01-What-Is-Machine-Learning.ipynb
│ ├── 05.02-Introducing-Scikit-Learn.ipynb
│ ├── 05.03-Hyperparameters-and-Model-Validation.ipynb
│ ├── 05.04-Feature-Engineering.ipynb
│ ├── 05.05-Naive-Bayes.ipynb
│ ├── 05.06-Linear-Regression.ipynb
│ ├── 05.07-Support-Vector-Machines.ipynb
│ ├── 05.08-Random-Forests.ipynb
│ ├── 05.09-Principal-Component-Analysis.ipynb
│ ├── 05.10-Manifold-Learning.ipynb
│ ├── 05.11-K-Means.ipynb
│ ├── 05.12-Gaussian-Mixtures.ipynb
│ ├── 05.13-Kernel-Density-Estimation.ipynb
│ ├── 05.14-Image-Features.ipynb
│ ├── 05.15-Learning-More.ipynb
│ ├── 06.00-Figure-Code.ipynb
│ ├── Index.ipynb
│ ├── Untitled.ipynb
│ ├── data/
│ │ ├── BicycleWeather.csv
│ │ ├── Seattle2014.csv
│ │ ├── births.csv
│ │ ├── california_cities.csv
│ │ ├── president_heights.csv
│ │ ├── state-abbrevs.csv
│ │ ├── state-areas.csv
│ │ └── state-population.csv
│ └── helpers_05_08.py
├── notebooks_v2/
│ └── data.csv
├── requirements.txt
├── tools/
│ ├── README.md
│ ├── add_book_info.py
│ ├── add_navigation.py
│ ├── fix_kernelspec.py
│ └── generate_contents.py
└── website/
  ├── .gitignore
  ├── Makefile
  ├── README.md
  ├── copy_notebooks.py
  ├── fabfile.py
  ├── pelicanconf.py
  ├── publishconf.py
  └── theme/
    ├── README.md
    ├── static/
    │ └── css/
    │   └── icons.css
    └── templates/
      ├── _includes/
      │ ├── analytics.html
      │ └── disqus_thread.html
      ├── about.html
      ├── archives.html
      ├── article.html
      ├── base.html
      ├── booksection.html
      ├── index.html
      ├── ipynb.css
      ├── main.css
      ├── main.less
      ├── page.html
      ├── pygments.css
      └── tag.html
================================================
FILE CONTENTS
================================================
================================================
FILE: .gitignore
================================================
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover
.hypothesis/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# IPython Notebook
.ipynb_checkpoints
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# dotenv
.env
# virtualenv
venv/
ENV/
# Spyder project settings
.spyderproject
# Rope project settings
.ropeproject
# Emacs
*~
# Temporary data files
notebooks/recipeitems-latest.json
notebooks/FremontBridge.csv
notebooks/gistemp250.nc
notebooks/marathon-data.csv
notebooks/my_figure.png
notebooks/hello.png
================================================
FILE: .gitmodules
================================================
[submodule "website/plugins/ipynb"]
path = website/plugins/ipynb
url = https://github.com/danielfrg/pelican-ipynb.git
[submodule "website/plugins/pelican-plugins"]
path = website/plugins/pelican-plugins
url = https://github.com/getpelican/pelican-plugins.git
================================================
FILE: LICENSE-CODE
================================================
The MIT License (MIT)
Copyright (c) 2016 Jacob VanderPlas
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
================================================
FILE: LICENSE-TEXT
================================================
Creative Commons Legal Code
Attribution-NonCommercial-NoDerivs 3.0 Unported
CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE
LEGAL SERVICES. DISTRIBUTION OF THIS LICENSE DOES NOT CREATE AN
ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS
INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES
REGARDING THE INFORMATION PROVIDED, AND DISCLAIMS LIABILITY FOR
DAMAGES RESULTING FROM ITS USE.
License
THE WORK (AS DEFINED BELOW) IS PROVIDED UNDER THE TERMS OF THIS CREATIVE
COMMONS PUBLIC LICENSE ("CCPL" OR "LICENSE"). THE WORK IS PROTECTED BY
COPYRIGHT AND/OR OTHER APPLICABLE LAW. ANY USE OF THE WORK OTHER THAN AS
AUTHORIZED UNDER THIS LICENSE OR COPYRIGHT LAW IS PROHIBITED.
BY EXERCISING ANY RIGHTS TO THE WORK PROVIDED HERE, YOU ACCEPT AND AGREE
TO BE BOUND BY THE TERMS OF THIS LICENSE. TO THE EXTENT THIS LICENSE MAY
BE CONSIDERED TO BE A CONTRACT, THE LICENSOR GRANTS YOU THE RIGHTS
CONTAINED HERE IN CONSIDERATION OF YOUR ACCEPTANCE OF SUCH TERMS AND
CONDITIONS.
1. Definitions
a. "Adaptation" means a work based upon the Work, or upon the Work and
other pre-existing works, such as a translation, adaptation,
derivative work, arrangement of music or other alterations of a
literary or artistic work, or phonogram or performance and includes
cinematographic adaptations or any other form in which the Work may be
recast, transformed, or adapted including in any form recognizably
derived from the original, except that a work that constitutes a
Collection will not be considered an Adaptation for the purpose of
this License. For the avoidance of doubt, where the Work is a musical
work, performance or phonogram, the synchronization of the Work in
timed-relation with a moving image ("synching") will be considered an
Adaptation for the purpose of this License.
b. "Collection" means a collection of literary or artistic works, such as
encyclopedias and anthologies, or performances, phonograms or
broadcasts, or other works or subject matter other than works listed
in Section 1(f) below, which, by reason of the selection and
arrangement of their contents, constitute intellectual creations, in
which the Work is included in its entirety in unmodified form along
with one or more other contributions, each constituting separate and
independent works in themselves, which together are assembled into a
collective whole. A work that constitutes a Collection will not be
considered an Adaptation (as defined above) for the purposes of this
License.
c. "Distribute" means to make available to the public the original and
copies of the Work through sale or other transfer of ownership.
d. "Licensor" means the individual, individuals, entity or entities that
offer(s) the Work under the terms of this License.
e. "Original Author" means, in the case of a literary or artistic work,
the individual, individuals, entity or entities who created the Work
or if no individual or entity can be identified, the publisher; and in
addition (i) in the case of a performance the actors, singers,
musicians, dancers, and other persons who act, sing, deliver, declaim,
play in, interpret or otherwise perform literary or artistic works or
expressions of folklore; (ii) in the case of a phonogram the producer
being the person or legal entity who first fixes the sounds of a
performance or other sounds; and, (iii) in the case of broadcasts, the
organization that transmits the broadcast.
f. "Work" means the literary and/or artistic work offered under the terms
of this License including without limitation any production in the
literary, scientific and artistic domain, whatever may be the mode or
form of its expression including digital form, such as a book,
pamphlet and other writing; a lecture, address, sermon or other work
of the same nature; a dramatic or dramatico-musical work; a
choreographic work or entertainment in dumb show; a musical
composition with or without words; a cinematographic work to which are
assimilated works expressed by a process analogous to cinematography;
a work of drawing, painting, architecture, sculpture, engraving or
lithography; a photographic work to which are assimilated works
expressed by a process analogous to photography; a work of applied
art; an illustration, map, plan, sketch or three-dimensional work
relative to geography, topography, architecture or science; a
performance; a broadcast; a phonogram; a compilation of data to the
extent it is protected as a copyrightable work; or a work performed by
a variety or circus performer to the extent it is not otherwise
considered a literary or artistic work.
g. "You" means an individual or entity exercising rights under this
License who has not previously violated the terms of this License with
respect to the Work, or who has received express permission from the
Licensor to exercise rights under this License despite a previous
violation.
h. "Publicly Perform" means to perform public recitations of the Work and
to communicate to the public those public recitations, by any means or
process, including by wire or wireless means or public digital
performances; to make available to the public Works in such a way that
members of the public may access these Works from a place and at a
place individually chosen by them; to perform the Work to the public
by any means or process and the communication to the public of the
performances of the Work, including by public digital performance; to
broadcast and rebroadcast the Work by any means including signs,
sounds or images.
i. "Reproduce" means to make copies of the Work by any means including
without limitation by sound or visual recordings and the right of
fixation and reproducing fixations of the Work, including storage of a
protected performance or phonogram in digital form or other electronic
medium.
2. Fair Dealing Rights. Nothing in this License is intended to reduce,
limit, or restrict any uses free from copyright or rights arising from
limitations or exceptions that are provided for in connection with the
copyright protection under copyright law or other applicable laws.
3. License Grant. Subject to the terms and conditions of this License,
Licensor hereby grants You a worldwide, royalty-free, non-exclusive,
perpetual (for the duration of the applicable copyright) license to
exercise the rights in the Work as stated below:
a. to Reproduce the Work, to incorporate the Work into one or more
Collections, and to Reproduce the Work as incorporated in the
Collections; and,
b. to Distribute and Publicly Perform the Work including as incorporated
in Collections.
The above rights may be exercised in all media and formats whether now
known or hereafter devised. The above rights include the right to make
such modifications as are technically necessary to exercise the rights in
other media and formats, but otherwise you have no rights to make
Adaptations. Subject to 8(f), all rights not expressly granted by Licensor
are hereby reserved, including but not limited to the rights set forth in
Section 4(d).
4. Restrictions. The license granted in Section 3 above is expressly made
subject to and limited by the following restrictions:
a. You may Distribute or Publicly Perform the Work only under the terms
of this License. You must include a copy of, or the Uniform Resource
Identifier (URI) for, this License with every copy of the Work You
Distribute or Publicly Perform. You may not offer or impose any terms
on the Work that restrict the terms of this License or the ability of
the recipient of the Work to exercise the rights granted to that
recipient under the terms of the License. You may not sublicense the
Work. You must keep intact all notices that refer to this License and
to the disclaimer of warranties with every copy of the Work You
Distribute or Publicly Perform. When You Distribute or Publicly
Perform the Work, You may not impose any effective technological
measures on the Work that restrict the ability of a recipient of the
Work from You to exercise the rights granted to that recipient under
the terms of the License. This Section 4(a) applies to the Work as
incorporated in a Collection, but this does not require the Collection
apart from the Work itself to be made subject to the terms of this
License. If You create a Collection, upon notice from any Licensor You
must, to the extent practicable, remove from the Collection any credit
as required by Section 4(c), as requested.
b. You may not exercise any of the rights granted to You in Section 3
above in any manner that is primarily intended for or directed toward
commercial advantage or private monetary compensation. The exchange of
the Work for other copyrighted works by means of digital file-sharing
or otherwise shall not be considered to be intended for or directed
toward commercial advantage or private monetary compensation, provided
there is no payment of any monetary compensation in connection with
the exchange of copyrighted works.
c. If You Distribute, or Publicly Perform the Work or Collections, You
must, unless a request has been made pursuant to Section 4(a), keep
intact all copyright notices for the Work and provide, reasonable to
the medium or means You are utilizing: (i) the name of the Original
Author (or pseudonym, if applicable) if supplied, and/or if the
Original Author and/or Licensor designate another party or parties
(e.g., a sponsor institute, publishing entity, journal) for
attribution ("Attribution Parties") in Licensor's copyright notice,
terms of service or by other reasonable means, the name of such party
or parties; (ii) the title of the Work if supplied; (iii) to the
extent reasonably practicable, the URI, if any, that Licensor
specifies to be associated with the Work, unless such URI does not
refer to the copyright notice or licensing information for the Work.
The credit required by this Section 4(c) may be implemented in any
reasonable manner; provided, however, that in the case of a
Collection, at a minimum such credit will appear, if a credit for all
contributing authors of Collection appears, then as part of these
credits and in a manner at least as prominent as the credits for the
other contributing authors. For the avoidance of doubt, You may only
use the credit required by this Section for the purpose of attribution
in the manner set out above and, by exercising Your rights under this
License, You may not implicitly or explicitly assert or imply any
connection with, sponsorship or endorsement by the Original Author,
Licensor and/or Attribution Parties, as appropriate, of You or Your
use of the Work, without the separate, express prior written
permission of the Original Author, Licensor and/or Attribution
Parties.
d. For the avoidance of doubt:
i. Non-waivable Compulsory License Schemes. In those jurisdictions in
which the right to collect royalties through any statutory or
compulsory licensing scheme cannot be waived, the Licensor
reserves the exclusive right to collect such royalties for any
exercise by You of the rights granted under this License;
ii. Waivable Compulsory License Schemes. In those jurisdictions in
which the right to collect royalties through any statutory or
compulsory licensing scheme can be waived, the Licensor reserves
the exclusive right to collect such royalties for any exercise by
You of the rights granted under this License if Your exercise of
such rights is for a purpose or use which is otherwise than
noncommercial as permitted under Section 4(b) and otherwise waives
the right to collect royalties through any statutory or compulsory
licensing scheme; and,
iii. Voluntary License Schemes. The Licensor reserves the right to
collect royalties, whether individually or, in the event that the
Licensor is a member of a collecting society that administers
voluntary licensing schemes, via that society, from any exercise
by You of the rights granted under this License that is for a
purpose or use which is otherwise than noncommercial as permitted
under Section 4(b).
e. Except as otherwise agreed in writing by the Licensor or as may be
otherwise permitted by applicable law, if You Reproduce, Distribute or
Publicly Perform the Work either by itself or as part of any
Collections, You must not distort, mutilate, modify or take other
derogatory action in relation to the Work which would be prejudicial
to the Original Author's honor or reputation.
5. Representations, Warranties and Disclaimer
UNLESS OTHERWISE MUTUALLY AGREED BY THE PARTIES IN WRITING, LICENSOR
OFFERS THE WORK AS-IS AND MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY
KIND CONCERNING THE WORK, EXPRESS, IMPLIED, STATUTORY OR OTHERWISE,
INCLUDING, WITHOUT LIMITATION, WARRANTIES OF TITLE, MERCHANTIBILITY,
FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF
LATENT OR OTHER DEFECTS, ACCURACY, OR THE PRESENCE OF ABSENCE OF ERRORS,
WHETHER OR NOT DISCOVERABLE. SOME JURISDICTIONS DO NOT ALLOW THE EXCLUSION
OF IMPLIED WARRANTIES, SO SUCH EXCLUSION MAY NOT APPLY TO YOU.
6. Limitation on Liability. EXCEPT TO THE EXTENT REQUIRED BY APPLICABLE
LAW, IN NO EVENT WILL LICENSOR BE LIABLE TO YOU ON ANY LEGAL THEORY FOR
ANY SPECIAL, INCIDENTAL, CONSEQUENTIAL, PUNITIVE OR EXEMPLARY DAMAGES
ARISING OUT OF THIS LICENSE OR THE USE OF THE WORK, EVEN IF LICENSOR HAS
BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
7. Termination
a. This License and the rights granted hereunder will terminate
automatically upon any breach by You of the terms of this License.
Individuals or entities who have received Collections from You under
this License, however, will not have their licenses terminated
provided such individuals or entities remain in full compliance with
those licenses. Sections 1, 2, 5, 6, 7, and 8 will survive any
termination of this License.
b. Subject to the above terms and conditions, the license granted here is
perpetual (for the duration of the applicable copyright in the Work).
Notwithstanding the above, Licensor reserves the right to release the
Work under different license terms or to stop distributing the Work at
any time; provided, however that any such election will not serve to
withdraw this License (or any other license that has been, or is
required to be, granted under the terms of this License), and this
License will continue in full force and effect unless terminated as
stated above.
8. Miscellaneous
a. Each time You Distribute or Publicly Perform the Work or a Collection,
the Licensor offers to the recipient a license to the Work on the same
terms and conditions as the license granted to You under this License.
b. If any provision of this License is invalid or unenforceable under
applicable law, it shall not affect the validity or enforceability of
the remainder of the terms of this License, and without further action
by the parties to this agreement, such provision shall be reformed to
the minimum extent necessary to make such provision valid and
enforceable.
c. No term or provision of this License shall be deemed waived and no
breach consented to unless such waiver or consent shall be in writing
and signed by the party to be charged with such waiver or consent.
d. This License constitutes the entire agreement between the parties with
respect to the Work licensed here. There are no understandings,
agreements or representations with respect to the Work not specified
here. Licensor shall not be bound by any additional provisions that
may appear in any communication from You. This License may not be
modified without the mutual written agreement of the Licensor and You.
e. The rights granted under, and the subject matter referenced, in this
License were drafted utilizing the terminology of the Berne Convention
for the Protection of Literary and Artistic Works (as amended on
September 28, 1979), the Rome Convention of 1961, the WIPO Copyright
Treaty of 1996, the WIPO Performances and Phonograms Treaty of 1996
and the Universal Copyright Convention (as revised on July 24, 1971).
These rights and subject matter take effect in the relevant
jurisdiction in which the License terms are sought to be enforced
according to the corresponding provisions of the implementation of
those treaty provisions in the applicable national law. If the
standard suite of rights granted under applicable copyright law
includes additional rights not granted under this License, such
additional rights are deemed to be included in the License; this
License is not intended to restrict the license of any rights under
applicable law.
Creative Commons Notice
Creative Commons is not a party to this License, and makes no warranty
whatsoever in connection with the Work. Creative Commons will not be
liable to You or any party on any legal theory for any damages
whatsoever, including without limitation any general, special,
incidental or consequential damages arising in connection to this
license. Notwithstanding the foregoing two (2) sentences, if Creative
Commons has expressly identified itself as the Licensor hereunder, it
shall have all rights and obligations of Licensor.
Except for the limited purpose of indicating to the public that the
Work is licensed under the CCPL, Creative Commons does not authorize
the use by either party of the trademark "Creative Commons" or any
related trademark or logo of Creative Commons without the prior
written consent of Creative Commons. Any permitted use will be in
compliance with Creative Commons' then-current trademark usage
guidelines, as may be published on its website or otherwise made
available upon request from time to time. For the avoidance of doubt,
this trademark restriction does not form part of this License.
Creative Commons may be contacted at https://creativecommons.org/.
================================================
FILE: README.md
================================================
# Python Data Science Handbook
[![Binder](https://mybinder.org/badge.svg)](https://mybinder.org/v2/gh/jakevdp/PythonDataScienceHandbook/master?filepath=notebooks%2FIndex.ipynb)
[![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/Index.ipynb)
This repository contains the entire [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do), in the form of (free!) Jupyter notebooks.

## How to Use this Book
- Read the book in its entirety online at https://jakevdp.github.io/PythonDataScienceHandbook/
- Run the code using the Jupyter notebooks available in this repository's [notebooks](notebooks) directory.
- Launch executable versions of these notebooks using [Google Colab](http://colab.research.google.com): [![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/Index.ipynb)
- Launch a live notebook server with these notebooks using [binder](https://beta.mybinder.org/): [![Binder](https://mybinder.org/badge.svg)](https://mybinder.org/v2/gh/jakevdp/PythonDataScienceHandbook/master?filepath=notebooks%2FIndex.ipynb)
- Buy the printed book through [O'Reilly Media](http://shop.oreilly.com/product/0636920034919.do)
## About
The book was written and tested with Python 3.5, though other Python versions (including Python 2.7) should work in nearly all cases.
The book introduces the core libraries essential for working with data in Python: particularly [IPython](http://ipython.org), [NumPy](http://numpy.org), [Pandas](http://pandas.pydata.org), [Matplotlib](http://matplotlib.org), [Scikit-Learn](http://scikit-learn.org), and related packages.
Familiarity with Python as a language is assumed; if you need a quick introduction to the language itself, see the free companion project,
[A Whirlwind Tour of Python](https://github.com/jakevdp/WhirlwindTourOfPython): it's a fast-paced introduction to the Python language aimed at researchers and scientists.
See [Index.ipynb](http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/Index.ipynb) for an index of the notebooks available to accompany the text.
## Software
The code in the book was tested with Python 3.5, though most (but not all) will also work correctly with Python 2.7 and other older Python versions.
The packages I used to run the code in the book are listed in [requirements.txt](requirements.txt). (Note that some of these exact version numbers may not be available on your platform: you may have to tweak them for your own use.)
To install the requirements using [conda](http://conda.pydata.org), run the following at the command-line:
```
$ conda install --file requirements.txt
```
To create a stand-alone environment named ``PDSH`` with Python 3.5 and all the required package versions, run the following:
```
$ conda create -n PDSH python=3.5 --file requirements.txt
```
You can read more about using conda environments in the [Managing Environments](http://conda.pydata.org/docs/using/envs.html) section of the conda documentation.
## License
### Code
The code in this repository, including all code samples in the notebooks listed above, is released under the [MIT license](LICENSE-CODE). Read more at the [Open Source Initiative](https://opensource.org/licenses/MIT).
### Text
The text content of the book is released under the [CC-BY-NC-ND license](LICENSE-TEXT). Read more at [Creative Commons](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode).
================================================
FILE: environment.yml
================================================
name: data-science-handbook
channels:
- conda-forge
dependencies:
- python=3.5
- pip:
- -r requirements.txt
================================================
FILE: notebooks/00.00-Preface.ipynb
================================================
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Preface"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## What Is Data Science?\n",
"\n",
"This is a book about doing data science with Python, which immediately begs the question: what is *data science*?\n",
"It's a surprisingly hard definition to nail down, especially given how ubiquitous the term has become.\n",
"Vocal critics have variously dismissed it as a superfluous label (after all, what science doesn't involve data?) or a simple buzzword that only exists to salt resumes and catch the eye of overzealous tech recruiters.\n",
"\n",
"In my mind, these critiques miss something important.\n",
"Data science, despite its hype-laden veneer, is perhaps the best label we have for the cross-disciplinary set of skills that are becoming increasingly important in many applications across industry and academia.\n",
"This *cross-disciplinary* piece is key: in my mind, the best existing definition of data science is illustrated by Drew Conway's Data Science Venn Diagram, first published on his blog in September 2010 (see the following figure)."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"\n",
"<small>(source: [Drew Conway](http://drewconway.com/zia/2013/3/26/the-data-science-venn-diagram), used by permission)</small>"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"While some of the intersection labels are a bit tongue-in-cheek, this diagram captures the essence of what I think people mean when they say \"data science\": it is fundamentally an interdisciplinary subject.\n",
"Data science comprises three distinct and overlapping areas: the skills of a *statistician* who knows how to model and summarize datasets (which are growing ever larger); the skills of a *computer scientist* who can design and use algorithms to efficiently store, process, and visualize this data; and the *domain expertise*—what we might think of as \"classical\" training in a subject—necessary both to formulate the right questions and to put their answers in context.\n",
"\n",
"With this in mind, I would encourage you to think of data science not as a new domain of knowledge to learn, but a new set of skills that you can apply within your current area of expertise.\n",
"Whether you are reporting election results, forecasting stock returns, optimizing online ad clicks, identifying microorganisms in microscope photos, seeking new classes of astronomical objects, or working with data in any other field, the goal of this book is to give you the ability to ask and answer new questions about your chosen subject area."
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Who Is This Book For?\n",
"\n",
"In my teaching both at the University of Washington and at various tech-focused conferences and meetups, one of the most common questions I have heard is this: \"How should I learn Python?\"\n",
"The people asking are generally technically minded students, developers, or researchers, often with an already strong background in writing code and using computational and numerical tools.\n",
"Most of these folks don't want to learn Python per se, but want to learn the language with the aim of using it as a tool for data-intensive and computational science.\n",
"While a large patchwork of videos, blog posts, and tutorials for this audience is available online, I've long been frustrated by the lack of a single good answer to this question; that is what inspired this book.\n",
"\n",
"The book is not meant to be an introduction to Python or to programming in general; I assume the reader has familiarity with the Python language, including defining functions, assigning variables, calling methods of objects, controlling the flow of a program, and other basic tasks.\n",
"Instead, it is meant to help Python users learn to use Python's data science stack—libraries such as those mentioned in the following section, and related tools—to effectively store, manipulate, and gain insight from data."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Why Python?\n",
"\n",
"Python has emerged over the last couple of decades as a first-class tool for scientific computing tasks, including the analysis and visualization of large datasets.\n",
"This may have come as a surprise to early proponents of the Python language: the language itself was not specifically designed with data analysis or scientific computing in mind.\n",
"The usefulness of Python for data science stems primarily from the large and active ecosystem of third-party packages: *NumPy* for manipulation of homogeneous array-based data, *Pandas* for manipulation of heterogeneous and labeled data, *SciPy* for common scientific computing tasks, *Matplotlib* for publication-quality visualizations, *IPython* for interactive execution and sharing of code, *Scikit-Learn* for machine learning, and many more tools that will be mentioned in the following pages.\n",
"\n",
"If you are looking for a guide to the Python language itself, I would suggest the sister project to this book, [https://www.oreilly.com/library/view/a-whirlwind-tour/9781492037859](_A Whirlwind Tour of the Python Language_).\n",
"This short report provides a tour of the essential features of the Python language, aimed at data scientists who already are familiar with one or more other programming languages."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Outline of the Book\n",
"\n",
"Each numbered part of this book focuses on a particular package or tool that contributes a fundamental piece of the Python data science story, and is broken into short self-contained chapters that each discuss a single concept:\n",
"\n",
"- *Part I, Jupyter: Beyond Normal Python*, introduces IPython and Jupyter. These packages provide the computational environment in which many Python-using data scientists work.\n",
"- *Part II, Introduction to NumPy*, focuses on the NumPy library, which provides the `ndarray` for efficient storage and manipulation of dense data arrays in Python.\n",
"- *Part III, Data Manipulation with Pandas*, introduces the Pandas library, which provides the `DataFrame` for efficient storage and manipulation of labeled/columnar data in Python.\n",
"- *Part IV, Visualization with Matplotlib*, concentrates on Matplotlib, a library that provides capabilities for a flexible range of data visualizations in Python.\n",
"- *Part V, Machine Learning*, focuses on the Scikit-Learn library, which provides efficient and clean Python implementations of the most important and established machine learning algorithms.\n",
"\n",
"The PyData world is certainly much larger than these six packages, and is growing every day.\n",
"With this in mind, I make every attempt throughout this book to provide references to other interesting efforts, projects, and packages that are pushing the boundaries of what can be done in Python.\n",
"Nevertheless, the packages I concentrate on are currently fundamental to much of the work being done in the Python data science space, and I expect they will remain important even as the ecosystem continues growing around them."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Using Code Examples\n",
"\n",
"Supplemental material (code examples, figures, etc.) is available for download at http://github.com/jakevdp/PythonDataScienceHandbook/. This book is here to help you get your job done. In general, if example code is offered with this book, you may use it in your programs and documentation. You do not need to contact us for permission unless you’re reproducing a significant portion of the code. For example, writing a program that uses several chunks of code from this book does not require permission. Selling or distributing a CD-ROM of examples from O’Reilly books does require permission. Answering a question by citing this book and quoting example code does not require permission. Incorporating a significant amount of example code from this book into your product’s documentation does require permission.\n",
"\n",
"We appreciate, but do not require, attribution. An attribution usually includes the title, author, publisher, and ISBN. For example: \"*Python Data Science Handbook*, 2nd edition, by Jake VanderPlas (O’Reilly). Copyright 2023 Jake VanderPlas, 978-1-098-12122-8.\"\n",
"\n",
"If you feel your use of code examples falls outside fair use or the permission given above, feel free to contact us at permissions@oreilly.com."
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Installation Considerations\n",
"\n",
"Installing Python and the suite of libraries that enable scientific computing is straightforward. This section will outline some of the things to keep in mind when setting up your computer.\n",
"\n",
"Though there are various ways to install Python, the one I would suggest for use in data science is the Anaconda distribution, which works similarly whether you use Windows, Linux, or macOS.\n",
"The Anaconda distribution comes in two flavors:\n",
"\n",
"- [Miniconda](http://conda.pydata.org/miniconda.html) gives you the Python interpreter itself, along with a command-line tool called *conda* which operates as a cross-platform package manager geared toward Python packages, similar in spirit to the apt or yum tools that Linux users might be familiar with.\n",
"\n",
"- [Anaconda](https://www.continuum.io/downloads) includes both Python and conda, and additionally bundles a suite of other preinstalled packages geared toward scientific computing. Because of the size of this bundle, expect the installation to consume several gigabytes of disk space.\n",
"\n",
"Any of the packages included with Anaconda can also be installed manually on top of Miniconda; for this reason I suggest starting with Miniconda.\n",
"\n",
"To get started, download and install the Miniconda package—make sure to choose a version with Python 3—and then install the core packages used in this book:\n",
"\n",
"```\n",
"[~]$ conda install numpy pandas scikit-learn matplotlib seaborn jupyter\n",
"```\n",
"\n",
"Throughout the text, we will also make use of other more specialized tools in Python's scientific ecosystem; installation is usually as easy as typing **`conda install packagename`**.\n",
"If you ever come across packages that are not available in the default conda channel, be sure to check out [*conda-forge*](https://conda-forge.org/), a broad, community-driven repository of conda packages.\n",
"\n",
"For more information on conda, including information about creating and using conda environments (which I would *highly* recommend), refer to [conda's online documentation](http://conda.pydata.org/docs/)."
]
}
],
"metadata": {
"anaconda-cloud": {},
"jupytext": {
"formats": "ipynb,md"
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.2"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
================================================
FILE: notebooks/01.00-IPython-Beyond-Normal-Python.ipynb
================================================
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Jupyter: Beyond Normal Python"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"There are many options for development environments for Python, and I'm often asked which one I use in my own work.\n",
"My answer sometimes surprises people: my preferred environment is [IPython](http://ipython.org/) plus a text editor (in my case, Emacs or VSCode depending on my mood).\n",
"Jupyter got its start as the IPython shell, which was created in 2001 by Fernando Perez as an enhanced Python interpreter and has since grown into a project aiming to provide, in Perez's words, \"Tools for the entire life cycle of research computing.\"\n",
"If Python is the engine of our data science task, you might think of Jupyter as the interactive control panel.\n",
"\n",
"As well as being a useful interactive interface to Python, Jupyter also provides a number of useful syntactic additions to the language; we'll cover the most useful of these additions here.\n",
"Perhaps the most familiar interface provided by the Jupyter project is the Jupyter Notebook, a browser-based environment that is useful for development, collaboration, sharing, and even publication of data science results.\n",
"As an example of the usefulness of the notebook format, look no further than the page you are reading: the entire manuscript for this book was composed as a set of Jupyter notebooks.\n",
"\n",
"This part of the book will start by stepping through some of the Jupyter and IPython features that are useful to the practice of data science, focusing especially on the syntax they offer beyond the standard features of Python.\n",
"Next, we will go into a bit more depth on some of the more useful *magic commands* that can speed up common tasks in creating and using data science code.\n",
"Finally, we will touch on some of the features of the notebook that make it useful for understanding data and sharing results."
]
}
],
"metadata": {
"anaconda-cloud": {},
"jupytext": {
"formats": "ipynb,md"
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.2"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
================================================
FILE: notebooks/01.01-Help-And-Documentation.ipynb
================================================
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Getting Started in IPython and Jupyter\n",
"\n",
"In writing Python code for data science, I generally go between three modes of working: I use the IPython shell for trying out short sequences of commands, the Jupyter Notebook for longer interactive analysis and for sharing content with others, and interactive development environments (IDEs) like Emacs or VSCode for creating reusable Python packages.\n",
"This chapter focuses on the first two modes: the IPython shell and the Jupyter Notebook.\n",
"Use of an IDE for software development is an important third tool in the data scientist's repertoire, but we will not directly address that here."
]
},
{
"cell_type": "markdown",
"id": "7b582097",
"metadata": {},
"source": [
"## Launching the IPython Shell\n",
"\n",
"The text in this part, like most of this book, is not designed to be absorbed passively.\n",
"I recommend that as you read through it, you follow along and experiment with the tools and syntax we cover: the muscle memory you build through doing this will be far more useful than the simple act of reading about it.\n",
"Start by launching the IPython interpreter by typing **`ipython`** on the command line; alternatively, if you've installed a distribution like Anaconda or EPD, there may be a launcher specific to your system (we'll discuss this more fully in [Help and Documentation in IPython](01.01-Help-And-Documentation.ipynb)).\n",
"\n",
"Once you do this, you should see a prompt like the following:\n",
"\n",
"```ipython\n",
"Python 3.9.2 (v3.9.2:1a79785e3e, Feb 19 2021, 09:06:10) \n",
"Type 'copyright', 'credits' or 'license' for more information\n",
"IPython 7.21.0 -- An enhanced Interactive Python. Type '?' for help.\n",
"\n",
"In [1]:\n",
"```\n",
"With that, you're ready to follow along."
]
},
{
"cell_type": "markdown",
"id": "d1d2d0fb",
"metadata": {},
"source": [
"## Launching the Jupyter Notebook\n",
"\n",
"The Jupyter Notebook is a browser-based graphical interface to the IPython shell, and builds on it a rich set of dynamic display capabilities.\n",
"As well as executing Python/IPython statements, notebooks allow the user to include formatted text, static and dynamic visualizations, mathematical equations, JavaScript widgets, and much more.\n",
"Furthermore, these documents can be saved in a way that lets other people open them and execute the code on their own systems.\n",
"\n",
"Though you'll view and edit Jupyter notebooks through your web browser window, they must connect to a running Python process in order to execute code.\n",
"You can start this process (known as a \"kernel\") by running the following command in your system shell:\n",
"\n",
"```\n",
"$ jupyter lab\n",
"```\n",
"\n",
"This command will launch a local web server that will be visible to your browser.\n",
"It immediately spits out a log showing what it is doing; that log will look something like this:\n",
"\n",
"```\n",
"$ jupyter lab\n",
"[ServerApp] Serving notebooks from local directory: /Users/jakevdp/PythonDataScienceHandbook\n",
"[ServerApp] Jupyter Server 1.4.1 is running at:\n",
"[ServerApp] http://localhost:8888/lab?token=dd852649\n",
"[ServerApp] Use Control-C to stop this server and shut down all kernels (twice to skip confirmation).\n",
"```\n",
"\n",
"Upon issuing the command, your default browser should automatically open and navigate to the listed local URL;\n",
"the exact address will depend on your system.\n",
"If the browser does not open automatically, you can open a window and manually open this address (*http://localhost:8888/lab/* in this example)."
]
},
{
"cell_type": "markdown",
"id": "92286db8",
"metadata": {},
"source": [
"## Help and Documentation in IPython"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"If you read no other section in this chapter, read this one: I find the tools discussed here to be the most transformative contributions of IPython to my daily workflow.\n",
"\n",
"When a technologically minded person is asked to help a friend, family member, or colleague with a computer problem, most of the time it's less a matter of knowing the answer than of knowing how to quickly find an unknown answer.\n",
"In data science it's the same: searchable web resources such as online documentation, mailing list threads, and Stack Overflow answers contain a wealth of information, even (especially?) about topics you've found yourself searching on before.\n",
"Being an effective practitioner of data science is less about memorizing the tool or command you should use for every possible situation, and more about learning to effectively find the information you don't know, whether through a web search engine or another means.\n",
"\n",
"One of the most useful functions of IPython/Jupyter is to shorten the gap between the user and the type of documentation and search that will help them do their work effectively.\n",
"While web searches still play a role in answering complicated questions, an amazing amount of information can be found through IPython alone.\n",
"Some examples of the questions IPython can help answer in a few keystrokes include:\n",
"\n",
"- How do I call this function? What arguments and options does it have?\n",
"- What does the source code of this Python object look like?\n",
"- What is in this package I imported? \n",
"- What attributes or methods does this object have?\n",
"\n",
"Here we'll discuss the tools provided in the IPython shell and Jupyter Notebook to quickly access this information, namely the `?` character to explore documentation, the `??` characters to explore source code, and the Tab key for autocompletion."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Accessing Documentation with ?\n",
"\n",
"The Python language and its data science ecosystem are built with the user in mind, and one big part of that is access to documentation.\n",
"Every Python object contains a reference to a string, known as a *docstring*, which in most cases will contain a concise summary of the object and how to use it.\n",
"Python has a built-in `help` function that can access this information and prints the results.\n",
"For example, to see the documentation of the built-in `len` function, you can do the following:\n",
"\n",
"```ipython\n",
"In [1]: help(len)\n",
"Help on built-in function len in module builtins:\n",
"\n",
"len(obj, /)\n",
" Return the number of items in a container.\n",
"```\n",
"\n",
"Depending on your interpreter, this information may be displayed as inline text or in a separate pop-up window."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Because finding help on an object is so common and useful, IPython and Jupyter introduce the `?` character as a shorthand for accessing this documentation and other relevant information:\n",
"\n",
"```ipython\n",
"In [2]: len?\n",
"Signature: len(obj, /)\n",
"Docstring: Return the number of items in a container.\n",
"Type: builtin_function_or_method\n",
"```"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This notation works for just about anything, including object methods:\n",
"\n",
"```ipython\n",
"In [3]: L = [1, 2, 3]\n",
"In [4]: L.insert?\n",
"Signature: L.insert(index, object, /)\n",
"Docstring: Insert object before index.\n",
"Type: builtin_function_or_method\n",
"```\n",
"\n",
"or even objects themselves, with the documentation from their type:\n",
"\n",
"```ipython\n",
"In [5]: L?\n",
"Type: list\n",
"String form: [1, 2, 3]\n",
"Length: 3\n",
"Docstring: \n",
"Built-in mutable sequence.\n",
"\n",
"If no argument is given, the constructor creates a new empty list.\n",
"The argument must be an iterable if specified.\n",
"```"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Importantly, this will even work for functions or other objects you create yourself!\n",
"Here we'll define a small function with a docstring:\n",
"\n",
"```ipython\n",
"In [6]: def square(a):\n",
" ....: \"\"\"Return the square of a.\"\"\"\n",
" ....: return a ** 2\n",
" ....:\n",
"```\n",
"\n",
"Note that to create a docstring for our function, we simply placed a string literal in the first line.\n",
"Because docstrings are usually multiple lines, by convention we used Python's triple-quote notation for multiline strings."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now we'll use the `?` to find this docstring:\n",
"\n",
"```ipython\n",
"In [7]: square?\n",
"Signature: square(a)\n",
"Docstring: Return the square of a.\n",
"File: <ipython-input-6>\n",
"Type: function\n",
"```\n",
"\n",
"This quick access to documentation via docstrings is one reason you should get in the habit of always adding such inline documentation to the code you write!"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Accessing Source Code with ??\n",
"\n",
"Because the Python language is so easily readable, another level of insight can usually be gained by reading the source code of the object you're curious about.\n",
"IPython and Jupyter provide a shortcut to the source code with the double question mark (`??`):\n",
"\n",
"```ipython\n",
"In [8]: square??\n",
"Signature: square(a)\n",
"Source: \n",
"def square(a):\n",
" \"\"\"Return the square of a.\"\"\"\n",
" return a ** 2\n",
"File: <ipython-input-6>\n",
"Type: function\n",
"```\n",
"\n",
"For simple functions like this, the double question mark can give quick insight into the under-the-hood details."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"If you play with this much, you'll notice that sometimes the `??` suffix doesn't display any source code: this is generally because the object in question is not implemented in Python, but in C or some other compiled extension language.\n",
"If this is the case, the `??` suffix gives the same output as the `?` suffix.\n",
"You'll find this particularly with many of Python's built-in objects and types, including the `len` function from earlier:\n",
"\n",
"```ipython\n",
"In [9]: len??\n",
"Signature: len(obj, /)\n",
"Docstring: Return the number of items in a container.\n",
"Type: builtin_function_or_method\n",
"```\n",
"\n",
"Using `?` and/or `??` is a powerful and quick way of finding information about what any Python function or module does."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Exploring Modules with Tab Completion\n",
"\n",
"Another useful interface is the use of the Tab key for autocompletion and exploration of the contents of objects, modules, and namespaces.\n",
"In the examples that follow, I'll use `<TAB>` to indicate when the Tab key should be pressed."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Tab completion of object contents\n",
"\n",
"Every Python object has various attributes and methods associated with it.\n",
"Like the `help` function mentioned earlier, Python has a built-in `dir` function that returns a list of these, but the tab-completion interface is much easier to use in practice.\n",
"To see a list of all available attributes of an object, you can type the name of the object followed by a period (\"`.`\") character and the Tab key:\n",
"\n",
"```ipython\n",
"In [10]: L.<TAB>\n",
" append() count insert reverse \n",
" clear extend pop sort \n",
" copy index remove \n",
"```\n",
"\n",
"To narrow down the list, you can type the first character or several characters of the name, and the Tab key will find the matching attributes and methods:\n",
"\n",
"```ipython\n",
"In [10]: L.c<TAB>\n",
" clear() count()\n",
" copy() \n",
"\n",
"In [10]: L.co<TAB>\n",
" copy() count()\n",
"```\n",
"\n",
"If there is only a single option, pressing the Tab key will complete the line for you.\n",
"For example, the following will instantly be replaced with `L.count`:\n",
"\n",
"```ipython\n",
"In [10]: L.cou<TAB>\n",
"\n",
"```\n",
"\n",
"Though Python has no strictly enforced distinction between public/external attributes and private/internal attributes, by convention a preceding underscore is used to denote the latter.\n",
"For clarity, these private methods and special methods are omitted from the list by default, but it's possible to list them by explicitly typing the underscore:\n",
"\n",
"```ipython\n",
"In [10]: L._<TAB>\n",
" __add__ __delattr__ __eq__ \n",
" __class__ __delitem__ __format__()\n",
" __class_getitem__() __dir__() __ge__ >\n",
" __contains__ __doc__ __getattribute__ \n",
"```\n",
"\n",
"For brevity, I've only shown the first few columns of the output.\n",
"Most of these are Python's special double-underscore methods (often nicknamed \"dunder\" methods)."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Tab completion when importing\n",
"\n",
"Tab completion is also useful when importing objects from packages.\n",
"Here we'll use it to find all possible imports in the `itertools` package that start with `co`:\n",
"\n",
"```ipython\n",
"In [10]: from itertools import co<TAB>\n",
" combinations() compress()\n",
" combinations_with_replacement() count()\n",
"```\n",
"\n",
"Similarly, you can use tab-completion to see which imports are available on your system (this will change depending on which third-party scripts and modules are visible to your Python session):\n",
"\n",
"```ipython\n",
"In [10]: import <TAB>\n",
" abc anyio \n",
" activate_this appdirs \n",
" aifc appnope >\n",
" antigravity argon2 \n",
"\n",
"In [10]: import h<TAB>\n",
" hashlib html \n",
" heapq http \n",
" hmac \n",
"```"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Beyond tab completion: Wildcard matching\n",
"\n",
"Tab completion is useful if you know the first few characters of the name of the object or attribute you're looking for, but is little help if you'd like to match characters in the middle or at the end of the name.\n",
"For this use case, IPython and Jupyter provide a means of wildcard matching for names using the `*` character.\n",
"\n",
"For example, we can use this to list every object in the namespace whose name ends with `Warning`:\n",
"\n",
"```ipython\n",
"In [10]: *Warning?\n",
"BytesWarning RuntimeWarning\n",
"DeprecationWarning SyntaxWarning\n",
"FutureWarning UnicodeWarning\n",
"ImportWarning UserWarning\n",
"PendingDeprecationWarning Warning\n",
"ResourceWarning\n",
"```\n",
"\n",
"Notice that the `*` character matches any string, including the empty string.\n",
"\n",
"Similarly, suppose we are looking for a string method that contains the word `find` somewhere in its name.\n",
"We can search for it this way:\n",
"\n",
"```ipython\n",
"In [11]: str.*find*?\n",
"str.find\n",
"str.rfind\n",
"```\n",
"\n",
"I find this type of flexible wildcard search can be useful for finding a particular command when getting to know a new package or reacquainting myself with a familiar one."
]
}
],
"metadata": {
"anaconda-cloud": {},
"jupytext": {
"formats": "ipynb,md"
},
"kernelspec": {
"display_name": "Python 3.9.6 64-bit ('3.9.6')",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.6"
},
"vscode": {
"interpreter": {
"hash": "513788764cd0ec0f97313d5418a13e1ea666d16d72f976a8acadce25a5af2ffc"
}
}
},
"nbformat": 4,
"nbformat_minor": 4
}
================================================
FILE: notebooks/01.02-Shell-Keyboard-Shortcuts.ipynb
================================================
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Keyboard Shortcuts in the IPython Shell"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"If you spend any amount of time on a computer, you've probably found a use for keyboard shortcuts in your workflow.\n",
"Most familiar perhaps are Cmd-c and Cmd-v (or Ctrl-c and Ctrl-v), used for copying and pasting in a wide variety of programs and systems.\n",
"Power users tend to go even further: popular text editors like Emacs, Vim, and others provide users an incredible range of operations through intricate combinations of keystrokes.\n",
"\n",
"The IPython shell doesn't go this far, but does provide a number of keyboard shortcuts for fast navigation while typing commands.\n",
"While some of these shortcuts do work in the browser-based notebooks, this section is primarily about shortcuts in the IPython shell.\n",
"\n",
"Once you get accustomed to these, they can be very useful for quickly performing certain commands without moving your hands from the \"home\" keyboard position.\n",
"If you're an Emacs user or if you have experience with Linux-style shells, the following will be very familiar.\n",
"I'll group these shortcuts into a few categories: *navigation shortcuts*, *text entry shortcuts*, *command history shortcuts*, and *miscellaneous shortcuts*."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Navigation Shortcuts\n",
"\n",
"While the use of the left and right arrow keys to move backward and forward in the line is quite obvious, there are other options that don't require moving your hands from the \"home\" keyboard position:\n",
"\n",
"| Keystroke | Action |\n",
"|---------------------------------|--------------------------------------------|\n",
"| Ctrl-a | Move cursor to beginning of line |\n",
"| Ctrl-e | Move cursor to end of the line |\n",
"| Ctrl-b or the left arrow key | Move cursor back one character |\n",
"| Ctrl-f or the right arrow key | Move cursor forward one character |"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Text Entry Shortcuts\n",
"\n",
"While everyone is familiar with using the Backspace key to delete the previous character, reaching for the key often requires some minor finger gymnastics, and it only deletes a single character at a time.\n",
"In IPython there are several shortcuts for removing some portion of the text you're typing; the most immediately useful of these are the commands to delete entire lines of text.\n",
"You'll know these have become second-nature if you find yourself using a combination of Ctrl-b and Ctrl-d instead of reaching for Backspace to delete the previous character!\n",
"\n",
"| Keystroke | Action |\n",
"|-----------------------------|--------------------------------------------------|\n",
"| Backspace key | Delete previous character in line |\n",
"| Ctrl-d | Delete next character in line |\n",
"| Ctrl-k | Cut text from cursor to end of line |\n",
"| Ctrl-u | Cut text from beginning of line to cursor |\n",
"| Ctrl-y | Yank (i.e., paste) text that was previously cut |\n",
"| Ctrl-t | Transpose (i.e., switch) previous two characters |"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Command History Shortcuts\n",
"\n",
"Perhaps the most impactful shortcuts discussed here are the ones IPython provides for navigating the command history.\n",
"This command history goes beyond your current IPython session: your entire command history is stored in a SQLite database in your IPython profile directory.\n",
"The most straightforward way to access previous commands is by using the up and down arrow keys to step through the history, but other options exist as well:\n",
"\n",
"| Keystroke | Action |\n",
"|-----------------------------------|--------------------------------------------|\n",
"| Ctrl-p (or the up arrow key) | Access previous command in history |\n",
"| Ctrl-n (or the down arrow key) | Access next command in history |\n",
"| Ctrl-r | Reverse-search through command history |"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The reverse-search option can be particularly useful.\n",
"Recall that earlier we defined a function called `square`.\n",
"Let's reverse-search our Python history from a new IPython shell and find this definition again.\n",
"When you press Ctrl-r in the IPython terminal, you'll see the following prompt:\n",
"\n",
"```ipython\n",
"In [1]:\n",
"(reverse-i-search)`': \n",
"```\n",
"\n",
"If you start typing characters at this prompt, IPython will autofill the most recent command, if any, that matches those characters:\n",
"\n",
"```ipython\n",
"In [1]: \n",
"(reverse-i-search)`sqa': square??\n",
"```\n",
"\n",
"At any point, you can add more characters to refine the search, or press Ctrl-r again to search further for another command that matches the query. If you followed along earlier, pressing Ctrl-r twice more gives:\n",
"\n",
"```ipython\n",
"In [1]: \n",
"(reverse-i-search)`sqa': def square(a):\n",
" \"\"\"Return the square of a\"\"\"\n",
" return a ** 2\n",
"```\n",
"\n",
"Once you have found the command you're looking for, press Return and the search will end.\n",
"You can then use the retrieved command and carry on with your session:\n",
"\n",
"```ipython\n",
"In [1]: def square(a):\n",
" \"\"\"Return the square of a\"\"\"\n",
" return a ** 2\n",
"\n",
"In [2]: square(2)\n",
"Out[2]: 4\n",
"```\n",
"\n",
"Note that you can use Ctrl-p/Ctrl-n or the up/down arrow keys to search through your history in a similar way, but only by matching characters at the beginning of the line.\n",
"That is, if you type **`def`** and then press Ctrl-p, it will find the most recent command (if any) in your history that begins with the characters `def`."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Miscellaneous Shortcuts\n",
"\n",
"Finally, there are a few miscellaneous shortcuts that don't fit into any of the preceding categories, but are nevertheless useful to know:\n",
"\n",
"| Keystroke | Action |\n",
"|-----------------------------|--------------------------------------------|\n",
"| Ctrl-l | Clear terminal screen |\n",
"| Ctrl-c | Interrupt current Python command |\n",
"| Ctrl-d | Exit IPython session |\n",
"\n",
"The Ctrl-c shortcut in particular can be useful when you inadvertently start a very long-running job."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"While some of the shortcuts discussed here may seem a bit obscure at first, they quickly become automatic with practice.\n",
"Once you develop that muscle memory, I suspect you will even find yourself wishing they were available in other contexts."
]
}
],
"metadata": {
"anaconda-cloud": {},
"jupytext": {
"formats": "ipynb,md"
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.2"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
================================================
FILE: notebooks/01.03-Magic-Commands.ipynb
================================================
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# IPython Magic Commands"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The previous chapter showed how IPython lets you use and explore Python efficiently and interactively.\n",
"Here we'll begin discussing some of the enhancements that IPython adds on top of the normal Python syntax.\n",
"These are known in IPython as *magic commands*, and are prefixed by the `%` character.\n",
"These magic commands are designed to succinctly solve various common problems in standard data analysis.\n",
"Magic commands come in two flavors: *line magics*, which are denoted by a single `%` prefix and operate on a single line of input, and *cell magics*, which are denoted by a double `%%` prefix and operate on multiple lines of input.\n",
"I'll demonstrate and discuss a few brief examples here, and come back to a more focused discussion of several useful magic commands later."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Running External Code: %run\n",
"As you begin developing more extensive code, you will likely find yourself working in IPython for interactive exploration, as well as a text editor to store code that you want to reuse.\n",
"Rather than running this code in a new window, it can be convenient to run it within your IPython session.\n",
"This can be done with the `%run` magic command.\n",
"\n",
"For example, imagine you've created a *myscript.py* file with the following contents:\n",
"\n",
"```python\n",
"# file: myscript.py\n",
"\n",
"def square(x):\n",
" \"\"\"square a number\"\"\"\n",
" return x ** 2\n",
"\n",
"for N in range(1, 4):\n",
" print(f\"{N} squared is {square(N)}\")\n",
"```\n",
"\n",
"You can execute this from your IPython session as follows:\n",
"\n",
"```ipython\n",
"In [6]: %run myscript.py\n",
"1 squared is 1\n",
"2 squared is 4\n",
"3 squared is 9\n",
"```\n",
"\n",
"Note also that after you've run this script, any functions defined within it are available for use in your IPython session:\n",
"\n",
"```ipython\n",
"In [7]: square(5)\n",
"Out[7]: 25\n",
"```\n",
"\n",
"There are several options to fine-tune how your code is run; you can see the documentation in the normal way, by typing **`%run?`** in the IPython interpreter."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Timing Code Execution: %timeit\n",
"Another example of a useful magic function is `%timeit`, which will automatically determine the execution time of the single-line Python statement that follows it.\n",
"For example, we may want to check the performance of a list comprehension:\n",
"\n",
"```ipython\n",
"In [8]: %timeit L = [n ** 2 for n in range(1000)]\n",
"430 µs ± 3.21 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)\n",
"```\n",
"\n",
"The benefit of `%timeit` is that for short commands it will automatically perform multiple runs in order to attain more robust results.\n",
"For multiline statements, adding a second `%` sign will turn this into a cell magic that can handle multiple lines of input.\n",
"For example, here's the equivalent construction with a `for` loop:\n",
"\n",
"```ipython\n",
"In [9]: %%timeit\n",
" ...: L = []\n",
" ...: for n in range(1000):\n",
" ...: L.append(n ** 2)\n",
" ...: \n",
"484 µs ± 5.67 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)\n",
"```\n",
"\n",
"We can immediately see that list comprehensions are about 10% faster than the equivalent `for` loop construction in this case.\n",
"We'll explore `%timeit` and other approaches to timing and profiling code in [Profiling and Timing Code](01.07-Timing-and-Profiling.ipynb)."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Help on Magic Functions: ?, %magic, and %lsmagic\n",
"\n",
"Like normal Python functions, IPython magic functions have docstrings, and this useful\n",
"documentation can be accessed in the standard manner.\n",
"So, for example, to read the documentation of the `%timeit` magic function, simply type this:\n",
"\n",
"```ipython\n",
"In [10]: %timeit?\n",
"```\n",
"\n",
"Documentation for other functions can be accessed similarly.\n",
"To access a general description of available magic functions, including some examples, you can type this:\n",
"\n",
"```ipython\n",
"In [11]: %magic\n",
"```\n",
"\n",
"For a quick and simple list of all available magic functions, type this:\n",
"\n",
"```ipython\n",
"In [12]: %lsmagic\n",
"```\n",
"\n",
"Finally, I'll mention that it is quite straightforward to define your own magic functions if you wish.\n",
"I won't discuss it here, but if you are interested, see the references listed in [More IPython Resources](01.08-More-IPython-Resources.ipynb)."
]
}
],
"metadata": {
"anaconda-cloud": {},
"jupytext": {
"formats": "ipynb,md"
},
"kernelspec": {
"display_name": "Python 3.9.6 64-bit ('3.9.6')",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.6"
},
"vscode": {
"interpreter": {
"hash": "513788764cd0ec0f97313d5418a13e1ea666d16d72f976a8acadce25a5af2ffc"
}
}
},
"nbformat": 4,
"nbformat_minor": 4
}
================================================
FILE: notebooks/01.04-Input-Output-History.ipynb
================================================
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Input and Output History"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Previously you saw that the IPython shell allows you to access previous commands with the up and down arrow keys, or equivalently the Ctrl-p/Ctrl-n shortcuts.\n",
"Additionally, in both the shell and notebooks, IPython exposes several ways to obtain the output of previous commands, as well as string versions of the commands themselves.\n",
"We'll explore those here."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## IPython's In and Out Objects\n",
"\n",
"By now I imagine you're becoming familiar with the `In [1]:`/`Out[1]:` style of prompts used by IPython.\n",
"But it turns out that these are not just pretty decoration: they give a clue as to how you can access previous inputs and outputs in your current session.\n",
"Suppose we start a session that looks like this:\n",
"\n",
"```ipython\n",
"In [1]: import math\n",
"\n",
"In [2]: math.sin(2)\n",
"Out[2]: 0.9092974268256817\n",
"\n",
"In [3]: math.cos(2)\n",
"Out[3]: -0.4161468365471424\n",
"```"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We've imported the built-in `math` package, then computed the sine and the cosine of the number 2.\n",
"These inputs and outputs are displayed in the shell with `In`/`Out` labels, but there's more—IPython actually creates some Python variables called `In` and `Out` that are automatically updated to reflect this history:\n",
"\n",
"```ipython\n",
"In [4]: In\n",
"Out[4]: ['', 'import math', 'math.sin(2)', 'math.cos(2)', 'In']\n",
"\n",
"In [5]: Out\n",
"Out[5]:\n",
"{2: 0.9092974268256817,\n",
" 3: -0.4161468365471424,\n",
" 4: ['', 'import math', 'math.sin(2)', 'math.cos(2)', 'In', 'Out']}\n",
"```"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The `In` object is a list, which keeps track of the commands in order (the first item in the list is a placeholder so that `In [1]` can refer to the first command):\n",
"\n",
"```ipython\n",
"In [6]: print(In[1])\n",
"import math\n",
"```\n",
"\n",
"The `Out` object is not a list but a dictionary mapping input numbers to their outputs (if any):\n",
"\n",
"```ipython\n",
"In [7]: print(Out[2])\n",
"0.9092974268256817\n",
"```\n",
"\n",
"Note that not all operations have outputs: for example, `import` statements and `print` statements don't affect the output.\n",
"The latter may be surprising, but makes sense if you consider that `print` is a function that returns `None`; for brevity, any command that returns `None` is not added to `Out`.\n",
"\n",
"Where this can be useful is if you want to interact with past results.\n",
"For example, let's check the sum of `sin(2) ** 2` and `cos(2) ** 2` using the previously computed results:\n",
"\n",
"```ipython\n",
"In [8]: Out[2] ** 2 + Out[3] ** 2\n",
"Out[8]: 1.0\n",
"```\n",
"\n",
"The result is `1.0`, as we'd expect from the well-known trigonometric identity.\n",
"In this case, using these previous results probably is not necessary, but it can become quite handy if you execute a very expensive computation and forget to assign the result to a variable."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Underscore Shortcuts and Previous Outputs\n",
"\n",
"The standard Python shell contains just one simple shortcut for accessing previous output: the variable `_` (i.e., a single underscore) is kept updated with the previous output. This works in IPython as well:\n",
"\n",
"```ipython\n",
"In [9]: print(_)\n",
"1.0\n",
"```\n",
"\n",
"But IPython takes this a bit further—you can use a double underscore to access the second-to-last output, and a triple underscore to access the third-to-last output (skipping any commands with no output):\n",
"\n",
"```ipython\n",
"In [10]: print(__)\n",
"-0.4161468365471424\n",
"\n",
"In [11]: print(___)\n",
"0.9092974268256817\n",
"```\n",
"\n",
"IPython stops there: more than three underscores starts to get a bit hard to count, and at that point it's easier to refer to the output by line number.\n",
"\n",
"There is one more shortcut I should mention, however—a shorthand for `Out[X]` is `_X` (i.e., a single underscore followed by the line number):\n",
"\n",
"```ipython\n",
"In [12]: Out[2]\n",
"Out[12]: 0.9092974268256817\n",
"\n",
"In [13]: _2\n",
"Out[13]: 0.9092974268256817\n",
"```"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Suppressing Output\n",
"Sometimes you might wish to suppress the output of a statement (this is perhaps most common with the plotting commands that we'll explore in [Introduction to Matplotlib](04.00-Introduction-To-Matplotlib.ipynb)).\n",
"Or maybe the command you're executing produces a result that you'd prefer not to store in your output history, perhaps so that it can be deallocated when other references are removed.\n",
"The easiest way to suppress the output of a command is to add a semicolon to the end of the line:\n",
"\n",
"```ipython\n",
"In [14]: math.sin(2) + math.cos(2);\n",
"```\n",
"\n",
"The result is computed silently, and the output is neither displayed on the screen nor stored in the `Out` dictionary:\n",
"\n",
"```ipython\n",
"In [15]: 14 in Out\n",
"Out[15]: False\n",
"```"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Related Magic Commands\n",
"For accessing a batch of previous inputs at once, the `%history` magic command is very helpful.\n",
"Here is how you can print the first four inputs:\n",
"\n",
"```ipython\n",
"In [16]: %history -n 1-3\n",
" 1: import math\n",
" 2: math.sin(2)\n",
" 3: math.cos(2)\n",
"```\n",
"\n",
"As usual, you can type `%history?` for more information and a description of options available (see [Help and Documentation in IPython](01.01-Help-And-Documentation.ipynb) for details on the `?` functionality).\n",
"Other useful magic commands are `%rerun`, which will re-execute some portion of the command history, and `%save`, which saves some set of the command history to a file)."
]
}
],
"metadata": {
"anaconda-cloud": {},
"jupytext": {
"formats": "ipynb,md"
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.2"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
================================================
FILE: notebooks/01.05-IPython-And-Shell-Commands.ipynb
================================================
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# IPython and Shell Commands"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"When working interactively with the standard Python interpreter, one of the frustrations is the need to switch between multiple windows to access Python tools and system command-line tools.\n",
"IPython bridges this gap, and gives you a syntax for executing shell commands directly from within the IPython terminal.\n",
"The magic happens with the exclamation point: anything appearing after `!` on a line will be executed not by the Python kernel, but by the system command line.\n",
"\n",
"The following discussion assumes you're on a Unix-like system, such as Linux or macOS.\n",
"Some of the examples that follow will fail on Windows, which uses a different type of shell by default, though if you use the *Windows Subsystem for Linux* the examples here should run correctly.\n",
"If you're unfamiliar with shell commands, I'd suggest reviewing the [Unix shell tutorial](http://swcarpentry.github.io/shell-novice/) put together by the always excellent Software Carpentry Foundation."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Quick Introduction to the Shell\n",
"\n",
"A full introduction to using the shell/terminal/command line is well beyond the scope of this chapter, but for the uninitiated I will offer a quick introduction here.\n",
"The shell is a way to interact textually with your computer.\n",
"Ever since the mid-1980s, when Microsoft and Apple introduced the first versions of their now ubiquitous graphical operating systems, most computer users have interacted with their operating systems through the familiar menu selections and drag-and-drop movements.\n",
"But operating systems existed long before these graphical user interfaces, and were primarily controlled through sequences of text input: at the prompt, the user would type a command, and the computer would do what the user told it to.\n",
"Those early prompt systems were the precursors of the shells and terminals that most data scientists still use today.\n",
"\n",
"Someone unfamiliar with the shell might ask why you would bother with this, when many of the same results can be accomplished by simply clicking on icons and menus.\n",
"A shell user might reply with another question: why hunt for icons and menu items when you can accomplish things much more easily by typing?\n",
"While it might sound like a typical tech preference impasse, when moving beyond basic tasks it quickly becomes clear that the shell offers much more control of advanced tasks—though admittedly the learning curve can be intimidating.\n",
"\n",
"As an example, here is a sample of a Linux/macOS shell session where a user explores, creates, and modifies directories and files on their system (`osx:~ $` is the prompt, and everything after the `$` is the typed command; text that is preceded by a `#` is meant just as description, rather than something you would actually type in):\n",
"\n",
"```bash\n",
"osx:~ $ echo \"hello world\" # echo is like Python's print function\n",
"hello world\n",
"\n",
"osx:~ $ pwd # pwd = print working directory\n",
"/home/jake # This is the \"path\" that we're sitting in\n",
"\n",
"osx:~ $ ls # ls = list working directory contents\n",
"notebooks projects \n",
"\n",
"osx:~ $ cd projects/ # cd = change directory\n",
"\n",
"osx:projects $ pwd\n",
"/home/jake/projects\n",
"\n",
"osx:projects $ ls\n",
"datasci_book mpld3 myproject.txt\n",
"\n",
"osx:projects $ mkdir myproject # mkdir = make new directory\n",
"\n",
"osx:projects $ cd myproject/\n",
"\n",
"osx:myproject $ mv ../myproject.txt ./ # mv = move file. Here we're moving the\n",
" # file myproject.txt from one directory\n",
" # up (../) to the current directory (./).\n",
"osx:myproject $ ls\n",
"myproject.txt\n",
"```\n",
"\n",
"Notice that all of this is just a compact way to do familiar operations (navigating a directory structure, creating a directory, moving a file, etc.) by typing commands rather than clicking icons and menus.\n",
"With just a few commands (`pwd`, `ls`, `cd`, `mkdir`, and `cp`) you can do many of the most common file operations, but it's when you go beyond these basics that the shell approach becomes really powerful."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Shell Commands in IPython\n",
"\n",
"Any standard shell command can be used directly in IPython by prefixing it with the `!` character.\n",
"For example, the `ls`, `pwd`, and `echo` commands can be run as follows:\n",
"\n",
"```ipython\n",
"In [1]: !ls\n",
"myproject.txt\n",
"\n",
"In [2]: !pwd\n",
"/home/jake/projects/myproject\n",
"\n",
"In [3]: !echo \"printing from the shell\"\n",
"printing from the shell\n",
"```"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Passing Values to and from the Shell\n",
"\n",
"Shell commands not only can be called from IPython, but can also be made to interact with the IPython namespace.\n",
"For example, you can save the output of any shell command to a Python list using the assignment operator, `=`:\n",
"\n",
"```ipython\n",
"In [4]: contents = !ls\n",
"\n",
"In [5]: print(contents)\n",
"['myproject.txt']\n",
"\n",
"In [6]: directory = !pwd\n",
"\n",
"In [7]: print(directory)\n",
"['/home/jake/projects/myproject']\n",
"```\n",
"\n",
"Strictly speaking, these results are not plain Python lists, but instances of a special shell return type (a list subclass) defined in IPython:\n",
"\n",
"```ipython\n",
"In [8]: type(directory)\n",
"IPython.utils.text.SList\n",
"```\n",
"\n",
"This looks and acts a lot like a Python list but has additional functionality, such as\n",
"the `grep` and `fields` methods and the `s`, `n`, and `p` properties that allow you to search, filter, and display the results in convenient ways.\n",
"For more information on these, you can use IPython's built-in help features."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Communication in the other direction—passing Python variables into the shell—is possible using the `{varname}` syntax:\n",
"\n",
"```ipython\n",
"In [9]: message = \"hello from Python\"\n",
"\n",
"In [10]: !echo {message}\n",
"hello from Python\n",
"```\n",
"\n",
"The curly braces contain the variable name, which is replaced by the variable's contents in the shell command."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Shell-Related Magic Commands\n",
"\n",
"If you play with IPython's shell commands for a while, you might notice that you cannot use `!cd` to navigate the filesystem:\n",
"\n",
"```ipython\n",
"In [11]: !pwd\n",
"/home/jake/projects/myproject\n",
"\n",
"In [12]: !cd ..\n",
"\n",
"In [13]: !pwd\n",
"/home/jake/projects/myproject\n",
"```\n",
"\n",
"The reason is that shell commands in the notebook are executed in a temporary subshell that does not maintain state from command to command.\n",
"If you'd like to change the working directory in a more enduring way, you can use the `%cd` magic command:\n",
"\n",
"```ipython\n",
"In [14]: %cd ..\n",
"/home/jake/projects\n",
"```\n",
"\n",
"In fact, by default you can even use this without the `%` sign:\n",
"\n",
"```ipython\n",
"In [15]: cd myproject\n",
"/home/jake/projects/myproject\n",
"```\n",
"\n",
"This is known as an *automagic* function, and the ability to execute such commands without an explicit `%` can be toggled with the `%automagic` magic function.\n",
"\n",
"Besides `%cd`, other available shell-like magic functions are `%cat`, `%cp`, `%env`, `%ls`, `%man`, `%mkdir`, `%more`, `%mv`, `%pwd`, `%rm`, and `%rmdir`, any of which can be used without the `%` sign if `automagic` is on.\n",
"This makes it so that you can almost treat the IPython prompt as if it's a normal shell:\n",
"\n",
"```ipython\n",
"In [16]: mkdir tmp\n",
"\n",
"In [17]: ls\n",
"myproject.txt tmp/\n",
"\n",
"In [18]: cp myproject.txt tmp/\n",
"\n",
"In [19]: ls tmp\n",
"myproject.txt\n",
"\n",
"In [20]: rm -r tmp\n",
"```\n",
"\n",
"This access to the shell from within the same terminal window as your Python session lets you more naturally combine Python and the shell in your workflows with fewer context switches."
]
}
],
"metadata": {
"anaconda-cloud": {},
"jupytext": {
"formats": "ipynb,md"
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.2"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
================================================
FILE: notebooks/01.06-Errors-and-Debugging.ipynb
================================================
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Errors and Debugging"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Code development and data analysis always require a bit of trial and error, and IPython contains tools to streamline this process.\n",
"This section will briefly cover some options for controlling Python's exception reporting, followed by exploring tools for debugging errors in code."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Controlling Exceptions: %xmode\n",
"\n",
"Most of the time when a Python script fails, it will raise an exception.\n",
"When the interpreter hits one of these exceptions, information about the cause of the error can be found in the *traceback*, which can be accessed from within Python.\n",
"With the `%xmode` magic function, IPython allows you to control the amount of information printed when the exception is raised.\n",
"Consider the following code:"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [],
"source": [
"def func1(a, b):\n",
" return a / b\n",
"\n",
"def func2(x):\n",
" a = x\n",
" b = x - 1\n",
" return func1(a, b)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"ename": "ZeroDivisionError",
"evalue": "division by zero",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[0;31mZeroDivisionError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-2-b2e110f6fc8f>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mfunc2\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m<ipython-input-1-d849e34d61fb>\u001b[0m in \u001b[0;36mfunc2\u001b[0;34m(x)\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0ma\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0mb\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mx\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 7\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfunc1\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m<ipython-input-1-d849e34d61fb>\u001b[0m in \u001b[0;36mfunc1\u001b[0;34m(a, b)\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mfunc1\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0ma\u001b[0m \u001b[0;34m/\u001b[0m \u001b[0mb\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mfunc2\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0ma\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mZeroDivisionError\u001b[0m: division by zero"
]
}
],
"source": [
"func2(1)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Calling `func2` results in an error, and reading the printed trace lets us see exactly what happened.\n",
"In the default mode, this trace includes several lines showing the context of each step that led to the error.\n",
"Using the `%xmode` magic function (short for *exception mode*), we can change what information is printed.\n",
"\n",
"`%xmode` takes a single argument, the mode, and there are three main possibilities: `Plain`, `Context`, and `Verbose` (recent IPython versions also accept an even terser `Minimal` mode).\n",
"The default is `Context`, which gives output like that just shown.\n",
"`Plain` is more compact and gives less information:"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Exception reporting mode: Plain\n"
]
}
],
"source": [
"%xmode Plain"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"ename": "ZeroDivisionError",
"evalue": "division by zero",
"output_type": "error",
"traceback": [
"Traceback \u001b[0;36m(most recent call last)\u001b[0m:\n",
" File \u001b[1;32m\"<ipython-input-4-b2e110f6fc8f>\"\u001b[0m, line \u001b[1;32m1\u001b[0m, in \u001b[1;35m<module>\u001b[0m\n func2(1)\n",
" File \u001b[1;32m\"<ipython-input-1-d849e34d61fb>\"\u001b[0m, line \u001b[1;32m7\u001b[0m, in \u001b[1;35mfunc2\u001b[0m\n return func1(a, b)\n",
"\u001b[0;36m File \u001b[0;32m\"<ipython-input-1-d849e34d61fb>\"\u001b[0;36m, line \u001b[0;32m2\u001b[0;36m, in \u001b[0;35mfunc1\u001b[0;36m\u001b[0m\n\u001b[0;31m return a / b\u001b[0m\n",
"\u001b[0;31mZeroDivisionError\u001b[0m\u001b[0;31m:\u001b[0m division by zero\n"
]
}
],
"source": [
"func2(1)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The `Verbose` mode adds some extra information, including the arguments to any functions that are called:"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Exception reporting mode: Verbose\n"
]
}
],
"source": [
"%xmode Verbose"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"ename": "ZeroDivisionError",
"evalue": "division by zero",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[0;31mZeroDivisionError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-6-b2e110f6fc8f>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mfunc2\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m \u001b[0;36mglobal\u001b[0m \u001b[0;36mfunc2\u001b[0m \u001b[0;34m= <function func2 at 0x103729320>\u001b[0m\n",
"\u001b[0;32m<ipython-input-1-d849e34d61fb>\u001b[0m in \u001b[0;36mfunc2\u001b[0;34m(x=1)\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0ma\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0mb\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mx\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 7\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfunc1\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m \u001b[0;36mglobal\u001b[0m \u001b[0;36mfunc1\u001b[0m \u001b[0;34m= <function func1 at 0x1037294d0>\u001b[0m\u001b[0;34m\n \u001b[0m\u001b[0;36ma\u001b[0m \u001b[0;34m= 1\u001b[0m\u001b[0;34m\n \u001b[0m\u001b[0;36mb\u001b[0m \u001b[0;34m= 0\u001b[0m\n",
"\u001b[0;32m<ipython-input-1-d849e34d61fb>\u001b[0m in \u001b[0;36mfunc1\u001b[0;34m(a=1, b=0)\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mfunc1\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0ma\u001b[0m \u001b[0;34m/\u001b[0m \u001b[0mb\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m \u001b[0;36ma\u001b[0m \u001b[0;34m= 1\u001b[0m\u001b[0;34m\n \u001b[0m\u001b[0;36mb\u001b[0m \u001b[0;34m= 0\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mfunc2\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0ma\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mZeroDivisionError\u001b[0m: division by zero"
]
}
],
"source": [
"func2(1)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This extra information can help you narrow in on why the exception is being raised.\n",
"So why not use the `Verbose` mode all the time?\n",
"As code gets complicated, this kind of traceback can get extremely long.\n",
"Depending on the context, sometimes the brevity of `Plain` or `Context` mode is easier to work with."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Debugging: When Reading Tracebacks Is Not Enough\n",
"\n",
"The standard Python tool for interactive debugging is `pdb`, the Python debugger.\n",
"This debugger lets the user step through the code line by line in order to see what might be causing a more difficult error.\n",
"The IPython-enhanced version of this is `ipdb`, the IPython debugger.\n",
"\n",
"There are many ways to launch and use both these debuggers; we won't cover them fully here.\n",
"Refer to the online documentation of these two utilities to learn more.\n",
"\n",
"In IPython, perhaps the most convenient interface to debugging is the `%debug` magic command.\n",
"If you call it after hitting an exception, it will automatically open an interactive debugging prompt at the point of the exception.\n",
"The `ipdb` prompt lets you explore the current state of the stack, explore the available variables, and even run Python commands!\n",
"\n",
"Let's look at the most recent exception, then do some basic tasks. We'll print the values of `a` and `b`, then type `quit` to quit the debugging session:"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"> <ipython-input-1-d849e34d61fb>(2)func1()\n",
" 1 def func1(a, b):\n",
"----> 2 return a / b\n",
" 3 \n",
"\n",
"ipdb> print(a)\n",
"1\n",
"ipdb> print(b)\n",
"0\n",
"ipdb> quit\n"
]
}
],
"source": [
"%debug"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The interactive debugger allows much more than this, though—we can even step up and down through the stack and explore the values of variables there:"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"> <ipython-input-1-d849e34d61fb>(2)func1()\n",
" 1 def func1(a, b):\n",
"----> 2 return a / b\n",
" 3 \n",
"\n",
"ipdb> up\n",
"> <ipython-input-1-d849e34d61fb>(7)func2()\n",
" 5 a = x\n",
" 6 b = x - 1\n",
"----> 7 return func1(a, b)\n",
"\n",
"ipdb> print(x)\n",
"1\n",
"ipdb> up\n",
"> <ipython-input-6-b2e110f6fc8f>(1)<module>()\n",
"----> 1 func2(1)\n",
"\n",
"ipdb> down\n",
"> <ipython-input-1-d849e34d61fb>(7)func2()\n",
" 5 a = x\n",
" 6 b = x - 1\n",
"----> 7 return func1(a, b)\n",
"\n",
"ipdb> quit\n"
]
}
],
"source": [
"%debug"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This allows us to quickly find out not only what caused the error, but what function calls led up to the error.\n",
"\n",
"If you'd like the debugger to launch automatically whenever an exception is raised, you can use the `%pdb` magic function to turn on this automatic behavior:"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Exception reporting mode: Plain\n",
"Automatic pdb calling has been turned ON\n"
]
},
{
"ename": "ZeroDivisionError",
"evalue": "division by zero",
"output_type": "error",
"traceback": [
"Traceback \u001b[0;36m(most recent call last)\u001b[0m:\n",
" File \u001b[1;32m\"<ipython-input-9-569a67d2d312>\"\u001b[0m, line \u001b[1;32m3\u001b[0m, in \u001b[1;35m<module>\u001b[0m\n func2(1)\n",
" File \u001b[1;32m\"<ipython-input-1-d849e34d61fb>\"\u001b[0m, line \u001b[1;32m7\u001b[0m, in \u001b[1;35mfunc2\u001b[0m\n return func1(a, b)\n",
"\u001b[0;36m File \u001b[0;32m\"<ipython-input-1-d849e34d61fb>\"\u001b[0;36m, line \u001b[0;32m2\u001b[0;36m, in \u001b[0;35mfunc1\u001b[0;36m\u001b[0m\n\u001b[0;31m return a / b\u001b[0m\n",
"\u001b[0;31mZeroDivisionError\u001b[0m\u001b[0;31m:\u001b[0m division by zero\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"> <ipython-input-1-d849e34d61fb>(2)func1()\n",
" 1 def func1(a, b):\n",
"----> 2 return a / b\n",
" 3 \n",
"\n",
"ipdb> print(b)\n",
"0\n",
"ipdb> quit\n"
]
}
],
"source": [
"%xmode Plain\n",
"%pdb on\n",
"func2(1)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Finally, if you have a script that you'd like to run from the beginning in interactive mode, you can run it with the command `%run -d`, and use the `next` command to step through the lines of code interactively."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Partial list of debugging commands\n",
"\n",
"There are many more available commands for interactive debugging than I've shown here. The following table contains a description of some of the more common and useful ones:\n",
"\n",
"| Command | Description |\n",
"|---------------|-------------------------------------------------------------|\n",
"| `l(ist)` | Show the current location in the file |\n",
"| `h(elp)` | Show a list of commands, or find help on a specific command |\n",
"| `q(uit)` | Quit the debugger and the program |\n",
"| `c(ontinue)` | Quit the debugger, continue in the program |\n",
"| `n(ext)` | Go to the next step of the program |\n",
"| `<enter>` | Repeat the previous command |\n",
"| `p(rint)` | Print variables |\n",
"| `s(tep)` | Step into a subroutine |\n",
"| `r(eturn)` | Return out of a subroutine |\n",
"\n",
"For more information, use the `help` command in the debugger, or take a look at `ipdb`'s [online documentation](https://github.com/gotcha/ipdb)."
]
}
],
"metadata": {
"anaconda-cloud": {},
"jupytext": {
"formats": "ipynb,md"
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.2"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
================================================
FILE: notebooks/01.07-Timing-and-Profiling.ipynb
================================================
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Profiling and Timing Code"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In the process of developing code and creating data processing pipelines, there are often trade-offs you can make between various implementations.\n",
"Early in developing your algorithm, it can be counterproductive to worry about such things. As Donald Knuth famously quipped, \"We should forget about small efficiencies, say about 97% of the time: premature optimization is the root of all evil.\"\n",
"\n",
"But once you have your code working, it can be useful to dig into its efficiency a bit.\n",
"Sometimes it's useful to check the execution time of a given command or set of commands; other times it's useful to examine a multiline process and determine where the bottleneck lies in some complicated series of operations.\n",
"IPython provides access to a wide array of functionality for this kind of timing and profiling of code.\n",
"Here we'll discuss the following IPython magic commands:\n",
"\n",
"- `%time`: Time the execution of a single statement\n",
"- `%timeit`: Time repeated execution of a single statement for more accuracy\n",
"- `%prun`: Run code with the profiler\n",
"- `%lprun`: Run code with the line-by-line profiler\n",
"- `%memit`: Measure the memory use of a single statement\n",
"- `%mprun`: Run code with the line-by-line memory profiler\n",
"\n",
"The last three commands are not bundled with IPython; to use them you'll need to get the `line_profiler` and `memory_profiler` extensions, which we will discuss in the following sections."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Timing Code Snippets: %timeit and %time\n",
"\n",
"We saw the `%timeit` line magic and `%%timeit` cell magic in the introduction to magic functions in [IPython Magic Commands](01.03-Magic-Commands.ipynb); these can be used to time the repeated execution of snippets of code:"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1.53 µs ± 47.8 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)\n"
]
}
],
"source": [
"%timeit sum(range(100))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Note that because this operation is so fast, `%timeit` automatically does a large number of repetitions.\n",
"For slower commands, `%timeit` will automatically adjust and perform fewer repetitions:"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"536 ms ± 15.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
]
}
],
"source": [
"%%timeit\n",
"total = 0\n",
"for i in range(1000):\n",
" for j in range(1000):\n",
" total += i * (-1) ** j"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Sometimes repeating an operation is not the best option.\n",
"For example, if we have a list that we'd like to sort, we might be misled by a repeated operation; sorting a pre-sorted list is much faster than sorting an unsorted list, so the repetition will skew the result:"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1.71 ms ± 334 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)\n"
]
}
],
"source": [
"import random\n",
"L = [random.random() for i in range(100000)]\n",
"%timeit L.sort()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"For this, the `%time` magic function may be a better choice. It also is a good choice for longer-running commands, when short, system-related delays are unlikely to affect the result.\n",
"Let's time the sorting of an unsorted and a presorted list:"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"sorting an unsorted list:\n",
"CPU times: user 31.3 ms, sys: 686 µs, total: 32 ms\n",
"Wall time: 33.3 ms\n"
]
}
],
"source": [
"import random\n",
"L = [random.random() for i in range(100000)]\n",
"print(\"sorting an unsorted list:\")\n",
"%time L.sort()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"sorting an already sorted list:\n",
"CPU times: user 5.19 ms, sys: 268 µs, total: 5.46 ms\n",
"Wall time: 14.1 ms\n"
]
}
],
"source": [
"print(\"sorting an already sorted list:\")\n",
"%time L.sort()"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"Notice how much faster the presorted list is to sort, but notice also how much longer the timing takes with `%time` versus `%timeit`, even for the presorted list!\n",
"This is a result of the fact that `%timeit` does some clever things under the hood to prevent system calls from interfering with the timing.\n",
"For example, it prevents cleanup of unused Python objects (known as *garbage collection*) that might otherwise affect the timing.\n",
"For this reason, `%timeit` results are usually noticeably faster than `%time` results.\n",
"\n",
"For `%time`, as with `%timeit`, using the `%%` cell magic syntax allows timing of multiline scripts:"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 655 ms, sys: 5.68 ms, total: 661 ms\n",
"Wall time: 710 ms\n"
]
}
],
"source": [
"%%time\n",
"total = 0\n",
"for i in range(1000):\n",
" for j in range(1000):\n",
" total += i * (-1) ** j"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"For more information on `%time` and `%timeit`, as well as their available options, use the IPython help functionality (e.g., type `%time?` at the IPython prompt)."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Profiling Full Scripts: %prun\n",
"\n",
"A program is made up of many single statements, and sometimes timing these statements in context is more important than timing them on their own.\n",
"Python contains a built-in code profiler (which you can read about in the Python documentation), but IPython offers a much more convenient way to use this profiler, in the form of the magic function `%prun`.\n",
"\n",
"By way of example, we'll define a simple function that does some calculations:"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": []
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"def sum_of_lists(N):\n",
" total = 0\n",
" for i in range(5):\n",
" L = [j ^ (j >> i) for j in range(N)]\n",
" total += sum(L)\n",
" return total"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now we can call `%prun` with a function call to see the profiled results:"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" "
]
},
{
"data": {
"text/plain": [
" 14 function calls in 0.932 seconds\n",
"\n",
" Ordered by: internal time\n",
"\n",
" ncalls tottime percall cumtime percall filename:lineno(function)\n",
" 5 0.808 0.162 0.808 0.162 <ipython-input-7-f105717832a2>:4(<listcomp>)\n",
" 5 0.066 0.013 0.066 0.013 {built-in method builtins.sum}\n",
" 1 0.044 0.044 0.918 0.918 <ipython-input-7-f105717832a2>:1(sum_of_lists)\n",
" 1 0.014 0.014 0.932 0.932 <string>:1(<module>)\n",
" 1 0.000 0.000 0.932 0.932 {built-in method builtins.exec}\n",
" 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"%prun sum_of_lists(1000000)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The result is a table that indicates, in order of total time on each function call, where the execution is spending the most time. In this case, the bulk of the execution time is in the list comprehension inside `sum_of_lists`.\n",
"From here, we could start thinking about what changes we might make to improve the performance of the algorithm.\n",
"\n",
"For more information on `%prun`, as well as its available options, use the IPython help functionality (i.e., type `%prun?` at the IPython prompt)."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Line-by-Line Profiling with %lprun\n",
"\n",
"The function-by-function profiling of `%prun` is useful, but sometimes it's more convenient to have a line-by-line profile report.\n",
"This is not built into Python or IPython, but there is a `line_profiler` package available for installation that can do this.\n",
"Start by using Python's packaging tool, `pip`, to install the `line_profiler` package:\n",
"\n",
"```\n",
"$ pip install line_profiler\n",
"```\n",
"\n",
"Next, you can use IPython to load the `line_profiler` IPython extension, offered as part of this package:"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"%load_ext line_profiler"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now the `%lprun` command will do a line-by-line profiling of any function. In this case, we need to tell it explicitly which functions we're interested in profiling:"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Timer unit: 1e-06 s\n",
"\n",
"Total time: 0.014803 s\n",
"File: <ipython-input-7-f105717832a2>\n",
"Function: sum_of_lists at line 1\n",
"\n",
"Line # Hits Time Per Hit % Time Line Contents\n",
"==============================================================\n",
" 1 def sum_of_lists(N):\n",
" 2 1 6.0 6.0 0.0 total = 0\n",
" 3 6 13.0 2.2 0.1 for i in range(5):\n",
" 4 5 14242.0 2848.4 96.2 L = [j ^ (j >> i) for j in range(N)]\n",
" 5 5 541.0 108.2 3.7 total += sum(L)\n",
" 6 1 1.0 1.0 0.0 return total"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"%lprun -f sum_of_lists sum_of_lists(5000)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The information at the top gives us the key to reading the results: the time is reported in microseconds, and we can see where the program is spending the most time.\n",
"At this point, we may be able to use this information to modify aspects of the script and make it perform better for our desired use case.\n",
"\n",
"For more information on `%lprun`, as well as its available options, use the IPython help functionality (i.e., type `%lprun?` at the IPython prompt)."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Profiling Memory Use: %memit and %mprun\n",
"\n",
"Another aspect of profiling is the amount of memory an operation uses.\n",
"This can be evaluated with another IPython extension, the `memory_profiler`.\n",
"As with the `line_profiler`, we start by `pip`-installing the extension:\n",
"\n",
"```\n",
"$ pip install memory_profiler\n",
"```\n",
"\n",
"Then we can use IPython to load it:"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"%load_ext memory_profiler"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The memory profiler extension contains two useful magic functions: `%memit` (which offers a memory-measuring equivalent of `%timeit`) and `%mprun` (which offers a memory-measuring equivalent of `%lprun`).\n",
"The `%memit` magic function can be used rather simply:"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"peak memory: 141.70 MiB, increment: 75.65 MiB\n"
]
}
],
"source": [
"%memit sum_of_lists(1000000)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We see that memory usage peaks at about 140 MB while this function runs, roughly 75 MB above the level before the call.\n",
"\n",
"For a line-by-line description of memory use, we can use the `%mprun` magic function.\n",
"Unfortunately, this works only for functions defined in separate modules rather than the notebook itself, so we'll start by using the `%%file` cell magic to create a simple module called `mprun_demo.py`, which contains our `sum_of_lists` function, with one addition that will make our memory profiling results more clear:"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Overwriting mprun_demo.py\n"
]
}
],
"source": [
"%%file mprun_demo.py\n",
"def sum_of_lists(N):\n",
" total = 0\n",
" for i in range(5):\n",
" L = [j ^ (j >> i) for j in range(N)]\n",
" total += sum(L)\n",
" del L # remove reference to L\n",
" return total"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We can now import the new version of this function and run the memory line profiler:"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
},
{
"data": {
"text/plain": [
"Filename: /Users/jakevdp/github/jakevdp/PythonDataScienceHandbook/notebooks_v2/mprun_demo.py\n",
"\n",
"Line # Mem usage Increment Occurences Line Contents\n",
"============================================================\n",
" 1 66.7 MiB 66.7 MiB 1 def sum_of_lists(N):\n",
" 2 66.7 MiB 0.0 MiB 1 total = 0\n",
" 3 75.1 MiB 8.4 MiB 6 for i in range(5):\n",
" 4 105.9 MiB 30.8 MiB 5000015 L = [j ^ (j >> i) for j in range(N)]\n",
" 5 109.8 MiB 3.8 MiB 5 total += sum(L)\n",
" 6 75.1 MiB -34.6 MiB 5 del L # remove reference to L\n",
" 7 66.9 MiB -8.2 MiB 1 return total"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from mprun_demo import sum_of_lists\n",
"%mprun -f sum_of_lists sum_of_lists(1000000)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Here, the `Increment` column tells us how much each line affects the total memory budget: observe that when we create and delete the list `L`, we are adding about 30 MB of memory usage.\n",
"This is on top of the background memory usage from the Python interpreter itself.\n",
"\n",
"For more information on `%memit` and `%mprun`, as well as their available options, use the IPython help functionality (e.g., type `%memit?` at the IPython prompt)."
]
}
],
"metadata": {
"anaconda-cloud": {},
"jupytext": {
"formats": "ipynb,md"
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.2"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
================================================
FILE: notebooks/01.08-More-IPython-Resources.ipynb
================================================
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# More IPython Resources"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In this set of chapters, we've just scratched the surface of using IPython to enable data science tasks.\n",
"Much more information is available both in print and on the web, and here I'll list some other resources that you may find helpful."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Web Resources\n",
"\n",
"- [The IPython website](http://ipython.org): The IPython website provides links to documentation, examples, tutorials, and a variety of other resources.\n",
"- [The nbviewer website](http://nbviewer.jupyter.org/): This site shows static renderings of any Jupyter notebook available on the internet. The front page features some example notebooks that you can browse to see what other folks are using IPython for!\n",
"- [A curated collection of Jupyter notebooks](https://github.com/jupyter/jupyter/wiki): This ever-growing list of notebooks, powered by nbviewer, shows the depth and breadth of numerical analysis you can do with IPython. It includes everything from short examples and tutorials to full-blown courses and books composed in the notebook format!\n",
"- Video tutorials: Searching the internet, you will find many video tutorials on IPython. I'd especially recommend seeking tutorials from the PyCon, SciPy, and PyData conferences by Fernando Perez and Brian Granger, two of the primary creators and maintainers of IPython and Jupyter."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Books\n",
"\n",
"- [*Python for Data Analysis* (O'Reilly)](http://shop.oreilly.com/product/0636920023784.do): Wes McKinney's book includes a chapter that covers using IPython as a data scientist. Although much of the material overlaps what we've discussed here, another perspective is always helpful.\n",
"- [*Learning IPython for Interactive Computing and Data Visualization* (Packt)](https://www.packtpub.com/big-data-and-business-intelligence/learning-ipython-interactive-computing-and-data-visualization): This short book by Cyrille Rossant offers a good introduction to using IPython for data analysis.\n",
"- [*IPython Interactive Computing and Visualization Cookbook* (Packt)](https://www.packtpub.com/big-data-and-business-intelligence/ipython-interactive-computing-and-visualization-cookbook): Also by Cyrille Rossant, this book is a longer and more advanced treatment of using IPython for data science. Despite its name, it's not just about IPython; it also goes into some depth on a broad range of data science topics.\n",
"\n",
"Finally, a reminder that you can find help on your own: IPython's `?`-based help functionality (discussed in [Help and Documentation in IPython](01.01-Help-And-Documentation.ipynb)) can be useful if you use it well and use it often.\n",
"As you go through the examples here and elsewhere, this can be used to familiarize yourself with all the tools that IPython has to offer."
]
}
],
"metadata": {
"anaconda-cloud": {},
"jupytext": {
"formats": "ipynb,md"
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.2"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
================================================
FILE: notebooks/02.00-Introduction-to-NumPy.ipynb
================================================
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Introduction to NumPy\n",
"\n",
"This part of the book, along with [Part 3](03.00-Introduction-to-Pandas.ipynb), outlines techniques for effectively loading, storing, and manipulating in-memory data in Python.\n",
"The topic is very broad: datasets can come from a wide range of sources and in a wide range of formats, including collections of documents, collections of images, collections of sound clips, collections of numerical measurements, or nearly anything else.\n",
"Despite this apparent heterogeneity, many datasets can be represented fundamentally as arrays of numbers.\n",
"\n",
"For example, images—particularly digital images—can be thought of as simply two-dimensional arrays of numbers representing pixel brightness across the area.\n",
"Sound clips can be thought of as one-dimensional arrays of intensity versus time.\n",
"Text can be converted in various ways into numerical representations, such as binary digits representing the frequency of certain words or pairs of words.\n",
"No matter what the data is, the first step in making it analyzable will be to transform it into arrays of numbers.\n",
"(We will discuss some specific examples of this process in [Feature Engineering](05.04-Feature-Engineering.ipynb).)\n",
"\n",
"For this reason, efficient storage and manipulation of numerical arrays is absolutely fundamental to the process of doing data science.\n",
"We'll now take a look at the specialized tools that Python has for handling such numerical arrays: the NumPy package and the Pandas package (discussed in [Part 3](03.00-Introduction-to-Pandas.ipynb)).\n",
"\n",
"This part of the book will cover NumPy in detail. NumPy (short for *Numerical Python*) provides an efficient interface to store and operate on dense data buffers.\n",
"In some ways, NumPy arrays are like Python's built-in `list` type, but NumPy arrays provide much more efficient storage and data operations as the arrays grow larger in size.\n",
"NumPy arrays form the core of nearly the entire ecosystem of data science tools in Python, so time spent learning to use NumPy effectively will be valuable no matter what aspect of data science interests you.\n",
"\n",
"If you followed the advice outlined in the Preface and installed the Anaconda stack, you already have NumPy installed and ready to go.\n",
"If you're more the do-it-yourself type, you can go to http://www.numpy.org/ and follow the installation instructions found there.\n",
"Once you do, you can import NumPy and double-check the version:"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"data": {
"text/plain": [
"'1.21.2'"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import numpy\n",
"numpy.__version__"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"For the pieces of the package discussed here, I'd recommend NumPy version 1.8 or later.\n",
"By convention, you'll find that most people in the SciPy/PyData world will import NumPy using `np` as an alias:"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [],
"source": [
"import numpy as np"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Throughout this chapter, and indeed the rest of the book, you'll find that this is the way we will import and use NumPy."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Reminder About Built-in Documentation\n",
"\n",
"As you read through this part of the book, don't forget that IPython gives you the ability to quickly explore the contents of a package (by using the tab completion feature), as well as the documentation of various functions (using the `?` character). For a refresher on these, refer back to [Help and Documentation in IPython](01.01-Help-And-Documentation.ipynb).\n",
"\n",
"For example, to display all the contents of the NumPy namespace, you can type this:\n",
"\n",
"```ipython\n",
"In [3]: np.<TAB>\n",
"```\n",
"\n",
"And to display NumPy's built-in documentation, you can use this:\n",
"\n",
"```ipython\n",
"In [4]: np?\n",
"```\n",
"\n",
"More detailed documentation, along with tutorials and other resources, can be found at http://www.numpy.org."
]
}
],
"metadata": {
"anaconda-cloud": {},
"jupytext": {
"formats": "ipynb,md"
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.2"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
================================================
FILE: notebooks/02.01-Understanding-Data-Types.ipynb
================================================
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Understanding Data Types in Python"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Effective data-driven science and computation requires understanding how data is stored and manipulated.\n",
"This chapter outlines and contrasts how arrays of data are handled in the Python language itself, and how NumPy improves on this.\n",
"Understanding this difference is fundamental to understanding much of the material throughout the rest of the book.\n",
"\n",
"Users of Python are often drawn in by its ease of use, one piece of which is dynamic typing.\n",
"While a statically typed language like C or Java requires each variable to be explicitly declared, a dynamically typed language like Python skips this specification. For example, in C you might specify a particular operation as follows:\n",
"\n",
"```C\n",
"/* C code */\n",
"int result = 0;\n",
"for(int i=0; i<100; i++){\n",
" result += i;\n",
"}\n",
"```\n",
"\n",
"While in Python the equivalent operation could be written this way:\n",
"\n",
"```python\n",
"# Python code\n",
"result = 0\n",
"for i in range(100):\n",
" result += i\n",
"```\n",
"\n",
"Notice one main difference: in C, the data type of each variable is explicitly declared, while in Python the types are dynamically inferred. This means, for example, that we can assign any kind of data to any variable:\n",
"\n",
"```python\n",
"# Python code\n",
"x = 4\n",
"x = \"four\"\n",
"```\n",
"\n",
"Here we've switched the contents of `x` from an integer to a string. The same thing in C would lead (depending on compiler settings) to a compilation error or other unintended consequences:\n",
"\n",
"```C\n",
"/* C code */\n",
"int x = 4;\n",
"x = \"four\"; // FAILS\n",
"```\n",
"\n",
"This sort of flexibility is one element that makes Python and other dynamically typed languages convenient and easy to use.\n",
"Understanding *how* this works is an important piece of learning to analyze data efficiently and effectively with Python.\n",
"But what this type flexibility also points to is the fact that Python variables are more than just their values; they also contain extra information about the *type* of the value. We'll explore this more in the sections that follow."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## A Python Integer Is More Than Just an Integer\n",
"\n",
"The standard Python implementation is written in C.\n",
"This means that every Python object is simply a cleverly disguised C structure, which contains not only its value, but other information as well. For example, when we define an integer in Python, such as `x = 10000`, `x` is not just a \"raw\" integer. It's actually a pointer to a compound C structure, which contains several values.\n",
"Looking through the Python 3.10 source code, we find that the integer (long) type definition effectively looks like this (once the C macros are expanded):\n",
"\n",
"```C\n",
"struct _longobject {\n",
" long ob_refcnt;\n",
" PyTypeObject *ob_type;\n",
" size_t ob_size;\n",
" long ob_digit[1];\n",
"};\n",
"```\n",
"\n",
"A single integer in Python 3.10 actually contains four pieces:\n",
"\n",
"- `ob_refcnt`, a reference count that helps Python silently handle memory allocation and deallocation\n",
"- `ob_type`, which encodes the type of the variable\n",
"- `ob_size`, which specifies the size of the following data members\n",
"- `ob_digit`, which contains the actual integer value that we expect the Python variable to represent\n",
"\n",
"This means that there is some overhead involved in storing an integer in Python as compared to a compiled language like C, as illustrated in the following figure:"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Here, `PyObject_HEAD` is the part of the structure containing the reference count, type code, and other pieces mentioned before.\n",
"\n",
"Notice the difference here: a C integer is essentially a label for a position in memory whose bytes encode an integer value.\n",
"A Python integer is a pointer to a position in memory containing all the Python object information, including the bytes that contain the integer value.\n",
"This extra information in the Python integer structure is what allows Python to be coded so freely and dynamically.\n",
"All this additional information in Python types comes at a cost, however, which becomes especially apparent in structures that combine many of these objects."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## A Python List Is More Than Just a List\n",
"\n",
"Let's consider now what happens when we use a Python data structure that holds many Python objects.\n",
"The standard mutable multielement container in Python is the list.\n",
"We can create a list of integers as follows:"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"data": {
"text/plain": [
"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"L = list(range(10))\n",
"L"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"data": {
"text/plain": [
"int"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"type(L[0])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Or, similarly, a list of strings:"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"data": {
"text/plain": [
"['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"L2 = [str(c) for c in L]\n",
"L2"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"data": {
"text/plain": [
"str"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"type(L2[0])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Because of Python's dynamic typing, we can even create heterogeneous lists:"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"data": {
"text/plain": [
"[bool, str, float, int]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"L3 = [True, \"2\", 3.0, 4]\n",
"[type(item) for item in L3]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"But this flexibility comes at a cost: to allow these flexible types, each item in the list must contain its own type, reference count, and other information. That is, each item is a complete Python object.\n",
"In the special case that all variables are of the same type, much of this information is redundant, so it can be much more efficient to store the data in a fixed-type array.\n",
"The difference between a dynamic-type list and a fixed-type (NumPy-style) array is illustrated in the following figure:"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"At the implementation level, the array essentially contains a single pointer to one contiguous block of data.\n",
"The Python list, on the other hand, contains a pointer to a block of pointers, each of which in turn points to a full Python object like the Python integer we saw earlier.\n",
"Again, the advantage of the list is flexibility: because each list element is a full structure containing both data and type information, the list can be filled with data of any desired type.\n",
"Fixed-type NumPy-style arrays lack this flexibility, but are much more efficient for storing and manipulating data."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Fixed-Type Arrays in Python\n",
"\n",
"Python offers several different options for storing data in efficient, fixed-type data buffers.\n",
"The built-in `array` module (part of the Python standard library) can be used to create dense arrays of a uniform type:"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"data": {
"text/plain": [
"array('i', [0, 1, 2, 3, 4, 5, 6, 7, 8, 9])"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import array\n",
"L = list(range(10))\n",
"A = array.array('i', L)\n",
"A"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Here, `'i'` is a type code indicating the contents are integers.\n",
"\n",
"Much more useful, however, is the `ndarray` object of the NumPy package.\n",
"While Python's `array` object provides efficient storage of array-based data, NumPy adds to this efficient *operations* on that data.\n",
"We will explore these operations in later chapters; next, I'll show you a few different ways of creating a NumPy array."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Creating Arrays from Python Lists\n",
"\n",
"We'll start with the standard NumPy import, under the alias `np`:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now we can use `np.array` to create arrays from Python lists:"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"data": {
"text/plain": [
"array([1, 4, 2, 5, 3])"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Integer array\n",
"np.array([1, 4, 2, 5, 3])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Remember that unlike Python lists, NumPy arrays can only contain data of the same type.\n",
"If the types do not match, NumPy will upcast them according to its type promotion rules; here, integers are upcast to floating point:"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"data": {
"text/plain": [
"array([3.14, 4. , 2. , 3. ])"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.array([3.14, 4, 2, 3])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"If we want to explicitly set the data type of the resulting array, we can use the `dtype` keyword:"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"data": {
"text/plain": [
"array([1., 2., 3., 4.], dtype=float32)"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.array([1, 2, 3, 4], dtype=np.float32)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Finally, unlike Python lists, which are always one-dimensional sequences, NumPy arrays can be multidimensional. Here's one way of initializing a multidimensional array using a list of lists:"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"data": {
"text/plain": [
"array([[2, 3, 4],\n",
" [4, 5, 6],\n",
" [6, 7, 8]])"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Nested lists result in multidimensional arrays\n",
"np.array([range(i, i + 3) for i in [2, 4, 6]])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The inner lists are treated as rows of the resulting two-dimensional array."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Creating Arrays from Scratch\n",
"\n",
"Especially for larger arrays, it is more efficient to create arrays from scratch using routines built into NumPy.\n",
"Here are several examples:"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"data": {
"text/plain": [
"array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Create a length-10 integer array filled with 0s\n",
"np.zeros(10, dtype=int)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"data": {
"text/plain": [
"array([[1., 1., 1., 1., 1.],\n",
" [1., 1., 1., 1., 1.],\n",
" [1., 1., 1., 1., 1.]])"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Create a 3x5 floating-point array filled with 1s\n",
"np.ones((3, 5), dtype=float)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"data": {
"text/plain": [
"array([[3.14, 3.14, 3.14, 3.14, 3.14],\n",
" [3.14, 3.14, 3.14, 3.14, 3.14],\n",
" [3.14, 3.14, 3.14, 3.14, 3.14]])"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Create a 3x5 array filled with 3.14\n",
"np.full((3, 5), 3.14)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"data": {
"text/plain": [
"array([ 0, 2, 4, 6, 8, 10, 12, 14, 16, 18])"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Create an array filled with a linear sequence\n",
"# starting at 0, stopping before 20, stepping by 2\n",
"# (this is similar to the built-in range function)\n",
"np.arange(0, 20, 2)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"data": {
"text/plain": [
"array([0. , 0.25, 0.5 , 0.75, 1. ])"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Create an array of five values evenly spaced between 0 and 1\n",
"np.linspace(0, 1, 5)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"data": {
"text/plain": [
"array([[0.09610171, 0.88193001, 0.70548015],\n",
" [0.35885395, 0.91670468, 0.8721031 ],\n",
" [0.73237865, 0.09708562, 0.52506779]])"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Create a 3x3 array of uniformly distributed\n",
"# pseudorandom values between 0 and 1\n",
"np.random.random((3, 3))"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"data": {
"text/plain": [
"array([[-0.46652655, -0.59158776, -1.05392451],\n",
" [-1.72634268, 0.03194069, -0.51048869],\n",
" [ 1.41240208, 1.77734462, -0.43820037]])"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Create a 3x3 array of normally distributed pseudorandom\n",
"# values with mean 0 and standard deviation 1\n",
"np.random.normal(0, 1, (3, 3))"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"data": {
"text/plain": [
"array([[4, 3, 8],\n",
" [6, 5, 0],\n",
" [1, 1, 4]])"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Create a 3x3 array of pseudorandom integers in the interval [0, 10)\n",
"np.random.randint(0, 10, (3, 3))"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"data": {
"text/plain": [
"array([[1., 0., 0.],\n",
" [0., 1., 0.],\n",
" [0., 0., 1.]])"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Create a 3x3 identity matrix\n",
"np.eye(3)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"data": {
"text/plain": [
"array([1., 1., 1.])"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Create an uninitialized array of three floats; the values will be\n",
"# whatever happens to already exist at that memory location\n",
"np.empty(3)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## NumPy Standard Data Types\n",
"\n",
"NumPy arrays contain values of a single type, so it is important to have detailed knowledge of those types and their limitations.\n",
"Because NumPy is built in C, the types will be familiar to users of C, Fortran, and other related languages.\n",
"\n",
"The standard NumPy data types are listed in the following table.\n",
"Note that when constructing an array, they can be specified using a string:\n",
"\n",
"```python\n",
"np.zeros(10, dtype='int16')\n",
"```\n",
"\n",
"Or using the associated NumPy object:\n",
"\n",
"```python\n",
"np.zeros(10, dtype=np.int16)\n",
"```"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"| Data type\t | Description |\n",
"|-------------|-------------|\n",
"| `bool_` | Boolean (True or False) stored as a byte |\n",
"| `int_` | Default integer type (same as C `long`; normally either `int64` or `int32`)| \n",
"| `intc` | Identical to C `int` (normally `int32` or `int64`)| \n",
"| `intp` | Integer used for indexing (same as C `ssize_t`; normally either `int32` or `int64`)| \n",
"| `int8` | Byte (–128 to 127)| \n",
"| `int16` | Integer (–32768 to 32767)|\n",
"| `int32` | Integer (–2147483648 to 2147483647)|\n",
"| `int64` | Integer (–9223372036854775808 to 9223372036854775807)| \n",
"| `uint8` | Unsigned integer (0 to 255)| \n",
"| `uint16` | Unsigned integer (0 to 65535)| \n",
"| `uint32` | Unsigned integer (0 to 4294967295)| \n",
"| `uint64` | Unsigned integer (0 to 18446744073709551615)| \n",
"| `float_` | Shorthand for `float64`| \n",
"| `float16` | Half-precision float: sign bit, 5 bits exponent, 10 bits mantissa| \n",
"| `float32` | Single-precision float: sign bit, 8 bits exponent, 23 bits mantissa| \n",
"| `float64` | Double-precision float: sign bit, 11 bits exponent, 52 bits mantissa| \n",
"| `complex_` | Shorthand for `complex128`| \n",
"| `complex64` | Complex number, represented by two 32-bit floats| \n",
"| `complex128`| Complex number, represented by two 64-bit floats| "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"More advanced type specification is possible, such as specifying big- or little-endian numbers; for more information, refer to the [NumPy documentation](http://numpy.org/).\n",
"NumPy also supports compound data types, which will be covered in [Structured Data: NumPy's Structured Arrays](02.09-Structured-Data-NumPy.ipynb)."
]
}
],
"metadata": {
"anaconda-cloud": {},
"jupytext": {
"formats": "ipynb,md"
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.2"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
================================================
FILE: notebooks/02.02-The-Basics-Of-NumPy-Arrays.ipynb
================================================
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# The Basics of NumPy Arrays"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Data manipulation in Python is nearly synonymous with NumPy array manipulation: even newer tools like Pandas ([Part 3](03.00-Introduction-to-Pandas.ipynb)) are built around the NumPy array.\n",
"This chapter will present several examples of using NumPy array manipulation to access data and subarrays, and to split, reshape, and join the arrays.\n",
"While the types of operations shown here may seem a bit dry and pedantic, they comprise the building blocks of many other examples used throughout the book.\n",
"Get to know them well!\n",
"\n",
"We'll cover a few categories of basic array manipulations here:\n",
"\n",
"- *Attributes of arrays*: Determining the size, shape, memory consumption, and data types of arrays\n",
"- *Indexing of arrays*: Getting and setting the values of individual array elements\n",
"- *Slicing of arrays*: Getting and setting smaller subarrays within a larger array\n",
"- *Reshaping of arrays*: Changing the shape of a given array\n",
"- *Joining and splitting of arrays*: Combining multiple arrays into one, and splitting one array into many"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## NumPy Array Attributes"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"First let's discuss some useful array attributes.\n",
"We'll start by defining random arrays of one, two, and three dimensions.\n",
"We'll use NumPy's random number generator, which we will *seed* with a set value in order to ensure that the same random arrays are generated each time this code is run:"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [],
"source": [
"import numpy as np\n",
"rng = np.random.default_rng(seed=1701) # seed for reproducibility\n",
"\n",
"x1 = rng.integers(10, size=6) # one-dimensional array\n",
"x2 = rng.integers(10, size=(3, 4)) # two-dimensional array\n",
"x3 = rng.integers(10, size=(3, 4, 5)) # three-dimensional array"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Each array has attributes including `ndim` (the number of dimensions), `shape` (the size of each dimension), `size` (the total number of elements in the array), and `dtype` (the type of each element):"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"x3 ndim: 3\n",
"x3 shape: (3, 4, 5)\n",
"x3 size: 60\n",
"dtype: int64\n"
]
}
],
"source": [
"print(\"x3 ndim: \", x3.ndim)\n",
"print(\"x3 shape:\", x3.shape)\n",
"print(\"x3 size: \", x3.size)\n",
"print(\"dtype: \", x3.dtype)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"For more discussion of data types, see [Understanding Data Types in Python](02.01-Understanding-Data-Types.ipynb)."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Array Indexing: Accessing Single Elements"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"If you are familiar with Python's standard list indexing, indexing in NumPy will feel quite familiar.\n",
"In a one-dimensional array, the $i^{th}$ value (counting from zero) can be accessed by specifying the desired index in square brackets, just as with Python lists:"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"data": {
"text/plain": [
"array([9, 4, 0, 3, 8, 6])"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"x1"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"data": {
"text/plain": [
"9"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"x1[0]"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"data": {
"text/plain": [
"8"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"x1[4]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"To index from the end of the array, you can use negative indices:"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"data": {
"text/plain": [
"6"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"x1[-1]"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"data": {
"text/plain": [
"8"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"x1[-2]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In a multidimensional array, items can be accessed using a comma-separated tuple of indices, such as `(row, column)` for a two-dimensional array:"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"data": {
"text/plain": [
"array([[3, 1, 3, 7],\n",
" [4, 0, 2, 3],\n",
" [0, 0, 6, 9]])"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"x2"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"data": {
"text/plain": [
"3"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"x2[0, 0]"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"x2[2, 0]"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"data": {
"text/plain": [
"9"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"x2[2, -1]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Values can also be modified using any of the preceding index notation:"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"data": {
"text/plain": [
"array([[12, 1, 3, 7],\n",
" [ 4, 0, 2, 3],\n",
" [ 0, 0, 6, 9]])"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"x2[0, 0] = 12\n",
"x2"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Keep in mind that, unlike Python lists, NumPy arrays have a fixed type.\n",
"This means, for example, that if you attempt to insert a floating-point value into an integer array, the value will be silently truncated. Don't be caught unaware by this behavior!"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"data": {
"text/plain": [
"array([3, 4, 0, 3, 8, 6])"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"x1[0] = 3.14159 # this will be truncated!\n",
"x1"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Array Slicing: Accessing Subarrays"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Just as we can use square brackets to access individual array elements, we can also use them to access subarrays with the *slice* notation, marked by the colon (`:`) character.\n",
"The NumPy slicing syntax follows that of the standard Python list; to access a slice of an array `x`, use this:\n",
"``` python\n",
"x[start:stop:step]\n",
"```\n",
"If any of these are unspecified, they default to the values `start=0`, `stop=<size of dimension>`, `step=1`.\n",
"Let's look at some examples of accessing subarrays in one dimension and in multiple dimensions."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### One-Dimensional Subarrays\n",
"\n",
"Here are some examples of accessing elements in one-dimensional subarrays:"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"data": {
"text/plain": [
"array([3, 4, 0, 3, 8, 6])"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"x1"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"data": {
"text/plain": [
"array([3, 4, 0])"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"x1[:3] # first three elements"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"data": {
"text/plain": [
"array([3, 8, 6])"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"x1[3:] # elements from index 3 onward"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"data": {
"text/plain": [
"array([4, 0, 3])"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"x1[1:4] # middle subarray"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"data": {
"text/plain": [
"array([3, 0, 8])"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"x1[::2] # every second element"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"data": {
"text/plain": [
"array([4, 3, 6])"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"x1[1::2] # every second element, starting at index 1"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"A potentially confusing case is when the `step` value is negative.\n",
"In this case, the defaults for `start` and `stop` are swapped.\n",
"This becomes a convenient way to reverse an array:"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"data": {
"text/plain": [
"array([6, 8, 3, 0, 4, 3])"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"x1[::-1] # all elements, reversed"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"data": {
"text/plain": [
"array([8, 0, 3])"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"x1[4::-2] # every second element from index 4, reversed"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Multidimensional Subarrays\n",
"\n",
"Multidimensional slices work in the same way, with multiple slices separated by commas.\n",
"For example:"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"data": {
"text/plain": [
"array([[12, 1, 3, 7],\n",
" [ 4, 0, 2, 3],\n",
" [ 0, 0, 6, 9]])"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"x2"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"data": {
"text/plain": [
"array([[12, 1, 3],\n",
" [ 4, 0, 2]])"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"x2[:2, :3] # first two rows & three columns"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"data": {
"text/plain": [
"array([[12, 3],\n",
" [ 4, 2],\n",
" [ 0, 6]])"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"x2[:3, ::2] # three rows, every second column"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 9, 6, 0, 0],\n",
" [ 3, 2, 0, 4],\n",
" [ 7, 3, 1, 12]])"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"x2[::-1, ::-1] # all rows & columns, reversed"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Accessing array rows and columns\n",
"\n",
"One commonly needed routine is accessing single rows or columns of an array.\n",
"This can be done by combining indexing and slicing, using an empty slice marked by a single colon (`:`):"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"data": {
"text/plain": [
"array([12, 4, 0])"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"x2[:, 0] # first column of x2"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"data": {
"text/plain": [
"array([12, 1, 3, 7])"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"x2[0, :] # first row of x2"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In the case of row access, the empty slice can be omitted for a more compact syntax:"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"data": {
"text/plain": [
"array([12, 1, 3, 7])"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"x2[0] # equivalent to x2[0, :]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Subarrays as No-Copy Views\n",
"\n",
"Unlike Python list slices, NumPy array slices are returned as *views* rather than *copies* of the array data.\n",
"Consider our two-dimensional array from before:"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[12 1 3 7]\n",
" [ 4 0 2 3]\n",
" [ 0 0 6 9]]\n"
]
}
],
"source": [
"print(x2)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's extract a $2 \\times 2$ subarray from this:"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[12 1]\n",
" [ 4 0]]\n"
]
}
],
"source": [
"x2_sub = x2[:2, :2]\n",
"print(x2_sub)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now if we modify this subarray, we'll see that the original array is changed! Observe:"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[99 1]\n",
" [ 4 0]]\n"
]
}
],
"source": [
"x2_sub[0, 0] = 99\n",
"print(x2_sub)"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[99 1 3 7]\n",
" [ 4 0 2 3]\n",
" [ 0 0 6 9]]\n"
]
}
],
"source": [
"print(x2)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Some users may find this surprising, but it can be advantageous: for example, when working with large datasets, we can access and process pieces of these datasets without the need to copy the underlying data buffer."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Creating Copies of Arrays\n",
"\n",
"Despite the nice features of array views, it is sometimes useful to instead explicitly copy the data within an array or a subarray. This can be most easily done with the `copy` method:"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[99 1]\n",
" [ 4 0]]\n"
]
}
],
"source": [
"x2_sub_copy = x2[:2, :2].copy()\n",
"print(x2_sub_copy)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"If we now modify this subarray, the original array is not touched:"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[42 1]\n",
" [ 4 0]]\n"
]
}
],
"source": [
"x2_sub_copy[0, 0] = 42\n",
"print(x2_sub_copy)"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[99 1 3 7]\n",
" [ 4 0 2 3]\n",
" [ 0 0 6 9]]\n"
]
}
],
"source": [
"print(x2)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Reshaping of Arrays\n",
"\n",
"Another useful type of operation is reshaping of arrays, which can be done with the `reshape` method.\n",
"For example, if you want to put the numbers 1 through 9 in a $3 \\times 3$ grid, you can do the following:"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[1 2 3]\n",
" [4 5 6]\n",
" [7 8 9]]\n"
]
}
],
"source": [
"grid = np.arange(1, 10).reshape(3, 3)\n",
"print(grid)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Note that for this to work, the size of the initial array must match the size of the reshaped array, and in most cases the `reshape` method will return a no-copy view of the initial array."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"A common reshaping operation is converting a one-dimensional array into a two-dimensional row or column matrix:"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"data": {
"text/plain": [
"array([[1, 2, 3]])"
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"x = np.array([1, 2, 3])\n",
"x.reshape((1, 3)) # row vector via reshape"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[1],\n",
" [2],\n",
" [3]])"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"x.reshape((3, 1)) # column vector via reshape"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"A convenient shorthand for this is to use `np.newaxis` in the slicing syntax:"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"data": {
"text/plain": [
"array([[1, 2, 3]])"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"x[np.newaxis, :] # row vector via newaxis"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"data": {
"text/plain": [
"array([[1],\n",
" [2],\n",
" [3]])"
]
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"x[:, np.newaxis] # column vector via newaxis"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This is a pattern that we will utilize often throughout the remainder of the book."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Array Concatenation and Splitting\n",
"\n",
"All of the preceding routines worked on single arrays. NumPy also provides tools to combine multiple arrays into one, and to conversely split a single array into multiple arrays."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Concatenation of Arrays\n",
"\n",
"Concatenation, or joining of two arrays in NumPy, is primarily accomplished using the routines `np.concatenate`, `np.vstack`, and `np.hstack`.\n",
"`np.concatenate` takes a tuple or list of arrays as its first argument, as you can see here:"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"data": {
"text/plain": [
"array([1, 2, 3, 3, 2, 1])"
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"x = np.array([1, 2, 3])\n",
"y = np.array([3, 2, 1])\n",
"np.concatenate([x, y])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You can also concatenate more than two arrays at once:"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[ 1 2 3 3 2 1 99 99 99]\n"
]
}
],
"source": [
"z = np.array([99, 99, 99])\n",
"print(np.concatenate([x, y, z]))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"And it can be used for two-dimensional arrays:"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [],
"source": [
"grid = np.array([[1, 2, 3],\n",
" [4, 5, 6]])"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"data": {
"text/plain": [
"array([[1, 2, 3],\n",
" [4, 5, 6],\n",
" [1, 2, 3],\n",
" [4, 5, 6]])"
]
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# concatenate along the first axis\n",
"np.concatenate([grid, grid])"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"data": {
"text/plain": [
"array([[1, 2, 3, 1, 2, 3],\n",
" [4, 5, 6, 4, 5, 6]])"
]
},
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# concatenate along the second axis (zero-indexed)\n",
"np.concatenate([grid, grid], axis=1)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"For working with arrays of mixed dimensions, it can be clearer to use the `np.vstack` (vertical stack) and `np.hstack` (horizontal stack) functions:"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"data": {
"text/plain": [
"array([[1, 2, 3],\n",
" [1, 2, 3],\n",
" [4, 5, 6]])"
]
},
"execution_count": 46,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# vertically stack the arrays\n",
"np.vstack([x, grid])"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 1, 2, 3, 99],\n",
" [ 4, 5, 6, 99]])"
]
},
"execution_count": 47,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# horizontally stack the arrays\n",
"y = np.array([[99],\n",
" [99]])\n",
"np.hstack([grid, y])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Similarly, for higher-dimensional arrays, `np.dstack` will stack arrays along the third axis."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Splitting of Arrays\n",
"\n",
"The opposite of concatenation is splitting, which is implemented by the functions `np.split`, `np.hsplit`, and `np.vsplit`. For each of these, we can pass a list of indices giving the split points:"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[1 2 3] [99 99] [3 2 1]\n"
]
}
],
"source": [
"x = [1, 2, 3, 99, 99, 3, 2, 1]\n",
"x1, x2, x3 = np.split(x, [3, 5])\n",
"print(x1, x2, x3)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Notice that *N* split points leads to *N* + 1 subarrays.\n",
"The related functions `np.hsplit` and `np.vsplit` are similar:"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 0, 1, 2, 3],\n",
" [ 4, 5, 6, 7],\n",
" [ 8, 9, 10, 11],\n",
" [12, 13, 14, 15]])"
]
},
"execution_count": 49,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"grid = np.arange(16).reshape((4, 4))\n",
"grid"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[0 1 2 3]\n",
" [4 5 6 7]]\n",
"[[ 8 9 10 11]\n",
" [12 13 14 15]]\n"
]
}
],
"source": [
"upper, lower = np.vsplit(grid, [2])\n",
"print(upper)\n",
"print(lower)"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[ 0 1]\n",
" [ 4 5]\n",
" [ 8 9]\n",
" [12 13]]\n",
"[[ 2 3]\n",
" [ 6 7]\n",
" [10 11]\n",
" [14 15]]\n"
]
}
],
"source": [
"left, right = np.hsplit(grid, [2])\n",
"print(left)\n",
"print(right)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Similarly, for higher-dimensional arrays, `np.dsplit` will split arrays along the third axis."
]
}
],
"metadata": {
"anaconda-cloud": {},
"jupytext": {
"formats": "ipynb,md"
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.2"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
================================================
FILE: notebooks/02.03-Computation-on-arrays-ufuncs.ipynb
================================================
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Computation on NumPy Arrays: Universal Functions"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"Up until now, we have been discussing some of the basic nuts and bolts of NumPy. In the next few chapters, we will dive into the reasons that NumPy is so important in the Python data science world: namely, because it provides an easy and flexible interface to optimize computation with arrays of data.\n",
"\n",
"Computation on NumPy arrays can be very fast, or it can be very slow.\n",
"The key to making it fast is to use vectorized operations, generally implemented through NumPy's *universal functions* (ufuncs).\n",
"This chapter motivates the need for NumPy's ufuncs, which can be used to make repeated calculations on array elements much more efficient.\n",
"It then introduces many of the most common and useful arithmetic ufuncs available in the NumPy package."
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## The Slowness of Loops\n",
"\n",
"Python's default implementation (known as CPython) does some operations very slowly.\n",
"This is partly due to the dynamic, interpreted nature of the language; types are flexible, so sequences of operations cannot be compiled down to efficient machine code as in languages like C and Fortran.\n",
"Recently there have been various attempts to address this weakness: well-known examples are the [PyPy project](http://pypy.org/), a just-in-time compiled implementation of Python; the [Cython project](http://cython.org), which converts Python code to compilable C code; and the [Numba project](http://numba.pydata.org/), which converts snippets of Python code to fast LLVM bytecode.\n",
"Each of these has its strengths and weaknesses, but it is safe to say that none of the three approaches has yet surpassed the reach and popularity of the standard CPython engine.\n",
"\n",
"The relative sluggishness of Python generally manifests itself in situations where many small operations are being repeated; for instance, looping over arrays to operate on each element.\n",
"For example, imagine we have an array of values and we'd like to compute the reciprocal of each.\n",
"A straightforward approach might look like this:"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"data": {
"text/plain": [
"array([0.11111111, 0.25 , 1. , 0.33333333, 0.125 ])"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import numpy as np\n",
"rng = np.random.default_rng(seed=1701)\n",
"\n",
"def compute_reciprocals(values):\n",
" output = np.empty(len(values))\n",
" for i in range(len(values)):\n",
" output[i] = 1.0 / values[i]\n",
" return output\n",
" \n",
"values = rng.integers(1, 10, size=5)\n",
"compute_reciprocals(values)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This implementation probably feels fairly natural to someone from, say, a C or Java background.\n",
"But if we measure the execution time of this code for a large input, we see that this operation is very slow—perhaps surprisingly so!\n",
"We'll benchmark this with IPython's `%timeit` magic (discussed in [Profiling and Timing Code](01.07-Timing-and-Profiling.ipynb)):"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2.61 s ± 192 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
]
}
],
"source": [
"big_array = rng.integers(1, 100, size=1000000)\n",
"%timeit compute_reciprocals(big_array)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"It takes several seconds to compute these million operations and to store the result!\n",
"When even cell phones have processing speeds measured in gigaflops (i.e., billions of numerical operations per second), this seems almost absurdly slow.\n",
"It turns out that the bottleneck here is not the operations themselves, but the type checking and function dispatches that CPython must do at each cycle of the loop.\n",
"Each time the reciprocal is computed, Python first examines the object's type and does a dynamic lookup of the correct function to use for that type.\n",
"If we were working in compiled code instead, this type specification would be known before the code executed and the result could be computed much more efficiently."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Introducing Ufuncs\n",
"\n",
"For many types of operations, NumPy provides a convenient interface into just this kind of statically typed, compiled routine. This is known as a *vectorized* operation.\n",
"For simple operations like the element-wise division here, vectorization is as simple as using Python arithmetic operators directly on the array object.\n",
"This vectorized approach is designed to push the loop into the compiled layer that underlies NumPy, leading to much faster execution.\n",
"\n",
"Compare the results of the following two operations:"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[0.11111111 0.25 1. 0.33333333 0.125 ]\n",
"[0.11111111 0.25 1. 0.33333333 0.125 ]\n"
]
}
],
"source": [
"print(compute_reciprocals(values))\n",
"print(1.0 / values)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Looking at the execution time for our big array, we see that it completes orders of magnitude faster than the Python loop:"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2.54 ms ± 383 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
]
}
],
"source": [
"%timeit (1.0 / big_array)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Vectorized operations in NumPy are implemented via ufuncs, whose main purpose is to quickly execute repeated operations on values in NumPy arrays.\n",
"Ufuncs are extremely flexible—before we saw an operation between a scalar and an array, but we can also operate between two arrays:"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"data": {
"text/plain": [
"array([0. , 0.5 , 0.66666667, 0.75 , 0.8 ])"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.arange(5) / np.arange(1, 6)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"And ufunc operations are not limited to one-dimensional arrays. They can act on multidimensional arrays as well:"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 1, 2, 4],\n",
" [ 8, 16, 32],\n",
" [ 64, 128, 256]])"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"x = np.arange(9).reshape((3, 3))\n",
"2 ** x"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Computations using vectorization through ufuncs are nearly always more efficient than their counterparts implemented using Python loops, especially as the arrays grow in size.\n",
"Any time you see such a loop in a NumPy script, you should consider whether it can be replaced with a vectorized expression."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Exploring NumPy's Ufuncs\n",
"\n",
"Ufuncs exist in two flavors: *unary ufuncs*, which operate on a single input, and *binary ufuncs*, which operate on two inputs.\n",
"We'll see examples of both these types of functions here."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Array Arithmetic\n",
"\n",
"NumPy's ufuncs feel very natural to use because they make use of Python's native arithmetic operators.\n",
"The standard addition, subtraction, multiplication, and division can all be used:"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"x = [0 1 2 3]\n",
"x + 5 = [5 6 7 8]\n",
"x - 5 = [-5 -4 -3 -2]\n",
"x * 2 = [0 2 4 6]\n",
"x / 2 = [0. 0.5 1. 1.5]\n",
"x // 2 = [0 0 1 1]\n"
]
}
],
"source": [
"x = np.arange(4)\n",
"print(\"x =\", x)\n",
"print(\"x + 5 =\", x + 5)\n",
"print(\"x - 5 =\", x - 5)\n",
"print(\"x * 2 =\", x * 2)\n",
"print(\"x / 2 =\", x / 2)\n",
"print(\"x // 2 =\", x // 2) # floor division"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"There is also a unary ufunc for negation, a `**` operator for exponentiation, and a `%` operator for modulus:"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"-x = [ 0 -1 -2 -3]\n",
"x ** 2 = [
gitextract_10clvamz/
├── .gitignore
├── .gitmodules
├── LICENSE-CODE
├── LICENSE-TEXT
├── README.md
├── environment.yml
├── notebooks/
│ ├── 00.00-Preface.ipynb
│ ├── 01.00-IPython-Beyond-Normal-Python.ipynb
│ ├── 01.01-Help-And-Documentation.ipynb
│ ├── 01.02-Shell-Keyboard-Shortcuts.ipynb
│ ├── 01.03-Magic-Commands.ipynb
│ ├── 01.04-Input-Output-History.ipynb
│ ├── 01.05-IPython-And-Shell-Commands.ipynb
│ ├── 01.06-Errors-and-Debugging.ipynb
│ ├── 01.07-Timing-and-Profiling.ipynb
│ ├── 01.08-More-IPython-Resources.ipynb
│ ├── 02.00-Introduction-to-NumPy.ipynb
│ ├── 02.01-Understanding-Data-Types.ipynb
│ ├── 02.02-The-Basics-Of-NumPy-Arrays.ipynb
│ ├── 02.03-Computation-on-arrays-ufuncs.ipynb
│ ├── 02.04-Computation-on-arrays-aggregates.ipynb
│ ├── 02.05-Computation-on-arrays-broadcasting.ipynb
│ ├── 02.06-Boolean-Arrays-and-Masks.ipynb
│ ├── 02.07-Fancy-Indexing.ipynb
│ ├── 02.08-Sorting.ipynb
│ ├── 02.09-Structured-Data-NumPy.ipynb
│ ├── 03.00-Introduction-to-Pandas.ipynb
│ ├── 03.01-Introducing-Pandas-Objects.ipynb
│ ├── 03.02-Data-Indexing-and-Selection.ipynb
│ ├── 03.03-Operations-in-Pandas.ipynb
│ ├── 03.04-Missing-Values.ipynb
│ ├── 03.05-Hierarchical-Indexing.ipynb
│ ├── 03.06-Concat-And-Append.ipynb
│ ├── 03.07-Merge-and-Join.ipynb
│ ├── 03.08-Aggregation-and-Grouping.ipynb
│ ├── 03.09-Pivot-Tables.ipynb
│ ├── 03.10-Working-With-Strings.ipynb
│ ├── 03.11-Working-with-Time-Series.ipynb
│ ├── 03.12-Performance-Eval-and-Query.ipynb
│ ├── 03.13-Further-Resources.ipynb
│ ├── 04.00-Introduction-To-Matplotlib.ipynb
│ ├── 04.01-Simple-Line-Plots.ipynb
│ ├── 04.02-Simple-Scatter-Plots.ipynb
│ ├── 04.03-Errorbars.ipynb
│ ├── 04.04-Density-and-Contour-Plots.ipynb
│ ├── 04.05-Histograms-and-Binnings.ipynb
│ ├── 04.06-Customizing-Legends.ipynb
│ ├── 04.07-Customizing-Colorbars.ipynb
│ ├── 04.08-Multiple-Subplots.ipynb
│ ├── 04.09-Text-and-Annotation.ipynb
│ ├── 04.10-Customizing-Ticks.ipynb
│ ├── 04.11-Settings-and-Stylesheets.ipynb
│ ├── 04.12-Three-Dimensional-Plotting.ipynb
│ ├── 04.14-Visualization-With-Seaborn.ipynb
│ ├── 04.15-Further-Resources.ipynb
│ ├── 05.00-Machine-Learning.ipynb
│ ├── 05.01-What-Is-Machine-Learning.ipynb
│ ├── 05.02-Introducing-Scikit-Learn.ipynb
│ ├── 05.03-Hyperparameters-and-Model-Validation.ipynb
│ ├── 05.04-Feature-Engineering.ipynb
│ ├── 05.05-Naive-Bayes.ipynb
│ ├── 05.06-Linear-Regression.ipynb
│ ├── 05.07-Support-Vector-Machines.ipynb
│ ├── 05.08-Random-Forests.ipynb
│ ├── 05.09-Principal-Component-Analysis.ipynb
│ ├── 05.10-Manifold-Learning.ipynb
│ ├── 05.11-K-Means.ipynb
│ ├── 05.12-Gaussian-Mixtures.ipynb
│ ├── 05.13-Kernel-Density-Estimation.ipynb
│ ├── 05.14-Image-Features.ipynb
│ ├── 05.15-Learning-More.ipynb
│ ├── 06.00-Figure-Code.ipynb
│ ├── Untitled.ipynb
│ ├── data/
│ │ ├── births.csv
│ │ ├── president_heights.csv
│ │ ├── state-abbrevs.csv
│ │ ├── state-areas.csv
│ │ └── state-population.csv
│ └── helpers_05_08.py
├── notebooks_v1/
│ ├── 00.00-Preface.ipynb
│ ├── 01.00-IPython-Beyond-Normal-Python.ipynb
│ ├── 01.01-Help-And-Documentation.ipynb
│ ├── 01.02-Shell-Keyboard-Shortcuts.ipynb
│ ├── 01.03-Magic-Commands.ipynb
│ ├── 01.04-Input-Output-History.ipynb
│ ├── 01.05-IPython-And-Shell-Commands.ipynb
│ ├── 01.06-Errors-and-Debugging.ipynb
│ ├── 01.07-Timing-and-Profiling.ipynb
│ ├── 01.08-More-IPython-Resources.ipynb
│ ├── 02.00-Introduction-to-NumPy.ipynb
│ ├── 02.01-Understanding-Data-Types.ipynb
│ ├── 02.02-The-Basics-Of-NumPy-Arrays.ipynb
│ ├── 02.03-Computation-on-arrays-ufuncs.ipynb
│ ├── 02.04-Computation-on-arrays-aggregates.ipynb
│ ├── 02.05-Computation-on-arrays-broadcasting.ipynb
│ ├── 02.06-Boolean-Arrays-and-Masks.ipynb
│ ├── 02.07-Fancy-Indexing.ipynb
│ ├── 02.08-Sorting.ipynb
│ ├── 02.09-Structured-Data-NumPy.ipynb
│ ├── 03.00-Introduction-to-Pandas.ipynb
│ ├── 03.01-Introducing-Pandas-Objects.ipynb
│ ├── 03.02-Data-Indexing-and-Selection.ipynb
│ ├── 03.03-Operations-in-Pandas.ipynb
│ ├── 03.04-Missing-Values.ipynb
│ ├── 03.05-Hierarchical-Indexing.ipynb
│ ├── 03.06-Concat-And-Append.ipynb
│ ├── 03.07-Merge-and-Join.ipynb
│ ├── 03.08-Aggregation-and-Grouping.ipynb
│ ├── 03.09-Pivot-Tables.ipynb
│ ├── 03.10-Working-With-Strings.ipynb
│ ├── 03.11-Working-with-Time-Series.ipynb
│ ├── 03.12-Performance-Eval-and-Query.ipynb
│ ├── 03.13-Further-Resources.ipynb
│ ├── 04.00-Introduction-To-Matplotlib.ipynb
│ ├── 04.01-Simple-Line-Plots.ipynb
│ ├── 04.02-Simple-Scatter-Plots.ipynb
│ ├── 04.03-Errorbars.ipynb
│ ├── 04.04-Density-and-Contour-Plots.ipynb
│ ├── 04.05-Histograms-and-Binnings.ipynb
│ ├── 04.06-Customizing-Legends.ipynb
│ ├── 04.07-Customizing-Colorbars.ipynb
│ ├── 04.08-Multiple-Subplots.ipynb
│ ├── 04.09-Text-and-Annotation.ipynb
│ ├── 04.10-Customizing-Ticks.ipynb
│ ├── 04.11-Settings-and-Stylesheets.ipynb
│ ├── 04.12-Three-Dimensional-Plotting.ipynb
│ ├── 04.13-Geographic-Data-With-Basemap.ipynb
│ ├── 04.14-Visualization-With-Seaborn.ipynb
│ ├── 04.15-Further-Resources.ipynb
│ ├── 05.00-Machine-Learning.ipynb
│ ├── 05.01-What-Is-Machine-Learning.ipynb
│ ├── 05.02-Introducing-Scikit-Learn.ipynb
│ ├── 05.03-Hyperparameters-and-Model-Validation.ipynb
│ ├── 05.04-Feature-Engineering.ipynb
│ ├── 05.05-Naive-Bayes.ipynb
│ ├── 05.06-Linear-Regression.ipynb
│ ├── 05.07-Support-Vector-Machines.ipynb
│ ├── 05.08-Random-Forests.ipynb
│ ├── 05.09-Principal-Component-Analysis.ipynb
│ ├── 05.10-Manifold-Learning.ipynb
│ ├── 05.11-K-Means.ipynb
│ ├── 05.12-Gaussian-Mixtures.ipynb
│ ├── 05.13-Kernel-Density-Estimation.ipynb
│ ├── 05.14-Image-Features.ipynb
│ ├── 05.15-Learning-More.ipynb
│ ├── 06.00-Figure-Code.ipynb
│ ├── Index.ipynb
│ ├── Untitled.ipynb
│ ├── data/
│ │ ├── BicycleWeather.csv
│ │ ├── Seattle2014.csv
│ │ ├── births.csv
│ │ ├── california_cities.csv
│ │ ├── president_heights.csv
│ │ ├── state-abbrevs.csv
│ │ ├── state-areas.csv
│ │ └── state-population.csv
│ └── helpers_05_08.py
├── notebooks_v2/
│ └── data.csv
├── requirements.txt
├── tools/
│ ├── README.md
│ ├── add_book_info.py
│ ├── add_navigation.py
│ ├── fix_kernelspec.py
│ └── generate_contents.py
└── website/
├── .gitignore
├── Makefile
├── README.md
├── copy_notebooks.py
├── fabfile.py
├── pelicanconf.py
├── publishconf.py
└── theme/
├── README.md
├── static/
│ └── css/
│ └── icons.css
└── templates/
├── _includes/
│ ├── analytics.html
│ └── disqus_thread.html
├── about.html
├── archives.html
├── article.html
├── base.html
├── booksection.html
├── index.html
├── ipynb.css
├── main.css
├── main.less
├── page.html
├── pygments.css
└── tag.html
SYMBOL INDEX (3 symbols across 1 file)

FILE: notebooks/helpers_05_08.py
  function visualize_tree (line 8) | def visualize_tree(estimator, X, y, boundaries=True,
  function plot_tree_interactive (line 60) | def plot_tree_interactive(X, y):
  function randomized_tree_interactive (line 68) | def randomized_tree_interactive(X, y):
Copy disabled (too large)
Download .json
Condensed preview — 187 files, each showing path, character count, and a content snippet. Download the .json file for the full structured content (16,468K chars).
[
{
"path": ".gitignore",
"chars": 1240,
"preview": "# Byte-compiled / optimized / DLL files\n__pycache__/\n*.py[cod]\n*$py.class\n\n# C extensions\n*.so\n\n# Distribution / packagi"
},
{
"path": ".gitmodules",
"chars": 259,
"preview": "[submodule \"website/plugins/ipynb\"]\n\tpath = website/plugins/ipynb\n\turl = git://github.com/danielfrg/pelican-ipynb.git\n[s"
},
{
"path": "LICENSE-CODE",
"chars": 1083,
"preview": "The MIT License (MIT)\n\nCopyright (c) 2016 Jacob VanderPlas\n\nPermission is hereby granted, free of charge, to any person "
},
{
"path": "LICENSE-TEXT",
"chars": 18650,
"preview": "Creative Commons Legal Code\n\nAttribution-NonCommercial-NoDerivs 3.0 Unported\n\n CREATIVE COMMONS CORPORATION IS NOT A "
},
{
"path": "README.md",
"chars": 3699,
"preview": "# Python Data Science Handbook\n\n[](https://mybinder.org/v2/gh/jakevdp/PythonDat"
},
{
"path": "environment.yml",
"chars": 117,
"preview": "name: data-science-handbook\nchannels:\n - conda-forge\ndependencies:\n - python=3.5\n - pip:\n - -r requirements.txt"
},
{
"path": "notebooks/00.00-Preface.ipynb",
"chars": 11787,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Preface\"\n ]\n },\n {\n \"cell_t"
},
{
"path": "notebooks/01.00-IPython-Beyond-Normal-Python.ipynb",
"chars": 2562,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Jupyter: Beyond Normal Python\"\n "
},
{
"path": "notebooks/01.01-Help-And-Documentation.ipynb",
"chars": 17408,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Getting Started in IPython and Ju"
},
{
"path": "notebooks/01.02-Shell-Keyboard-Shortcuts.ipynb",
"chars": 8383,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Keyboard Shortcuts in the IPython"
},
{
"path": "notebooks/01.03-Magic-Commands.ipynb",
"chars": 5813,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# IPython Magic Commands\"\n ]\n },"
},
{
"path": "notebooks/01.04-Input-Output-History.ipynb",
"chars": 7118,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Input and Output History\"\n ]\n "
},
{
"path": "notebooks/01.05-IPython-And-Shell-Commands.ipynb",
"chars": 9592,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# IPython and Shell Commands\"\n ]\n"
},
{
"path": "notebooks/01.06-Errors-and-Debugging.ipynb",
"chars": 17003,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Errors and Debugging\"\n ]\n },\n "
},
{
"path": "notebooks/01.07-Timing-and-Profiling.ipynb",
"chars": 17126,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Profiling and Timing Code\"\n ]\n "
},
{
"path": "notebooks/01.08-More-IPython-Resources.ipynb",
"chars": 3656,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# More IPython Resources\"\n ]\n },"
},
{
"path": "notebooks/02.00-Introduction-to-NumPy.ipynb",
"chars": 5371,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Introduction to NumPy\\n\",\n \"\\n"
},
{
"path": "notebooks/02.01-Understanding-Data-Types.ipynb",
"chars": 22956,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Understanding Data Types in Pytho"
},
{
"path": "notebooks/02.02-The-Basics-Of-NumPy-Arrays.ipynb",
"chars": 32798,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# The Basics of NumPy Arrays\"\n ]\n"
},
{
"path": "notebooks/02.03-Computation-on-arrays-ufuncs.ipynb",
"chars": 31234,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Computation on NumPy Arrays: Univ"
},
{
"path": "notebooks/02.04-Computation-on-arrays-aggregates.ipynb",
"chars": 27788,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Aggregations: min, max, and Every"
},
{
"path": "notebooks/02.05-Computation-on-arrays-broadcasting.ipynb",
"chars": 40728,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Computation on Arrays: Broadcasti"
},
{
"path": "notebooks/02.06-Boolean-Arrays-and-Masks.ipynb",
"chars": 39846,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Comparisons, Masks, and Boolean L"
},
{
"path": "notebooks/02.07-Fancy-Indexing.ipynb",
"chars": 63263,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Fancy Indexing\"\n ]\n },\n {\n "
},
{
"path": "notebooks/02.08-Sorting.ipynb",
"chars": 51876,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Sorting Arrays\"\n ]\n },\n {\n "
},
{
"path": "notebooks/02.09-Structured-Data-NumPy.ipynb",
"chars": 15988,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Structured Data: NumPy's Structur"
},
{
"path": "notebooks/03.00-Introduction-to-Pandas.ipynb",
"chars": 5197,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Data Manipulation with Pandas\"\n "
},
{
"path": "notebooks/03.01-Introducing-Pandas-Objects.ipynb",
"chars": 42503,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Introducing Pandas Objects\"\n ]\n"
},
{
"path": "notebooks/03.02-Data-Indexing-and-Selection.ipynb",
"chars": 42164,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Data Indexing and Selection\"\n ]"
},
{
"path": "notebooks/03.03-Operations-in-Pandas.ipynb",
"chars": 29932,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Operating on Data in Pandas\"\n ]"
},
{
"path": "notebooks/03.04-Missing-Values.ipynb",
"chars": 40848,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Handling Missing Data\"\n ]\n },\n"
},
{
"path": "notebooks/03.05-Hierarchical-Indexing.ipynb",
"chars": 71418,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Hierarchical Indexing\"\n ]\n },\n"
},
{
"path": "notebooks/03.06-Concat-And-Append.ipynb",
"chars": 53493,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Combining Datasets: concat and ap"
},
{
"path": "notebooks/03.07-Merge-and-Join.ipynb",
"chars": 119862,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Combining Datasets: merge and joi"
},
{
"path": "notebooks/03.08-Aggregation-and-Grouping.ipynb",
"chars": 73496,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Aggregation and Grouping\"\n ]\n "
},
{
"path": "notebooks/03.09-Pivot-Tables.ipynb",
"chars": 158742,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Pivot Tables\"\n ]\n },\n {\n \"c"
},
{
"path": "notebooks/03.10-Working-With-Strings.ipynb",
"chars": 38413,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Vectorized String Operations\"\n "
},
{
"path": "notebooks/03.11-Working-with-Time-Series.ipynb",
"chars": 520474,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Working with Time Series\"\n ]\n "
},
{
"path": "notebooks/03.12-Performance-Eval-and-Query.ipynb",
"chars": 31906,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# High-Performance Pandas: eval and"
},
{
"path": "notebooks/03.13-Further-Resources.ipynb",
"chars": 2569,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Further Resources\"\n ]\n },\n {\n"
},
{
"path": "notebooks/04.00-Introduction-To-Matplotlib.ipynb",
"chars": 109653,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Visualization with Matplotlib\"\n "
},
{
"path": "notebooks/04.01-Simple-Line-Plots.ipynb",
"chars": 346106,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Simple Line Plots\"\n ]\n },\n {\n"
},
{
"path": "notebooks/04.02-Simple-Scatter-Plots.ipynb",
"chars": 180797,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Simple Scatter Plots\"\n ]\n },\n "
},
{
"path": "notebooks/04.03-Errorbars.ipynb",
"chars": 46820,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Visualizing Uncertainties\"\n ]\n "
},
{
"path": "notebooks/04.04-Density-and-Contour-Plots.ipynb",
"chars": 450593,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Density and Contour Plots\"\n ]\n "
},
{
"path": "notebooks/04.05-Histograms-and-Binnings.ipynb",
"chars": 77163,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Histograms, Binnings, and Density"
},
{
"path": "notebooks/04.06-Customizing-Legends.ipynb",
"chars": 245918,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Customizing Plot Legends\"\n ]\n "
},
{
"path": "notebooks/04.07-Customizing-Colorbars.ipynb",
"chars": 353661,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Customizing Colorbars\"\n ]\n },\n"
},
{
"path": "notebooks/04.08-Multiple-Subplots.ipynb",
"chars": 155766,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Multiple Subplots\"\n ]\n },\n {\n"
},
{
"path": "notebooks/04.09-Text-and-Annotation.ipynb",
"chars": 232410,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Text and Annotation\"\n ]\n },\n "
},
{
"path": "notebooks/04.10-Customizing-Ticks.ipynb",
"chars": 236865,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Customizing Ticks\"\n ]\n },\n {\n"
},
{
"path": "notebooks/04.11-Settings-and-Stylesheets.ipynb",
"chars": 404688,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Customizing Matplotlib: Configura"
},
{
"path": "notebooks/04.12-Three-Dimensional-Plotting.ipynb",
"chars": 506655,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Three-Dimensional Plotting in Mat"
},
{
"path": "notebooks/04.14-Visualization-With-Seaborn.ipynb",
"chars": 860199,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Visualization with Seaborn\"\n ]\n"
},
{
"path": "notebooks/04.15-Further-Resources.ipynb",
"chars": 3539,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Further Resources\\n\",\n \"\\n\",\n "
},
{
"path": "notebooks/05.00-Machine-Learning.ipynb",
"chars": 2216,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Machine Learning\"\n ]\n },\n {\n "
},
{
"path": "notebooks/05.01-What-Is-Machine-Learning.ipynb",
"chars": 20977,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# What Is Machine Learning?\"\n ]\n "
},
{
"path": "notebooks/05.02-Introducing-Scikit-Learn.ipynb",
"chars": 411885,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Introducing Scikit-Learn\"\n ]\n "
},
{
"path": "notebooks/05.03-Hyperparameters-and-Model-Validation.ipynb",
"chars": 176938,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Hyperparameters and Model Validat"
},
{
"path": "notebooks/05.04-Feature-Engineering.ipynb",
"chars": 53853,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Feature Engineering\"\n ]\n },\n "
},
{
"path": "notebooks/05.05-Naive-Bayes.ipynb",
"chars": 229011,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# In Depth: Naive Bayes Classificat"
},
{
"path": "notebooks/05.06-Linear-Regression.ipynb",
"chars": 274088,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# In Depth: Linear Regression\"\n ]"
},
{
"path": "notebooks/05.07-Support-Vector-Machines.ipynb",
"chars": 794767,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# In Depth: Support Vector Machines"
},
{
"path": "notebooks/05.08-Random-Forests.ipynb",
"chars": 286362,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# In Depth: Decision Trees and Rand"
},
{
"path": "notebooks/05.09-Principal-Component-Analysis.ipynb",
"chars": 510352,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# In Depth: Principal Component An"
},
{
"path": "notebooks/05.10-Manifold-Learning.ipynb",
"chars": 973915,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# In Depth: Manifold Learning\"\n ]"
},
{
"path": "notebooks/05.11-K-Means.ipynb",
"chars": 1750177,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# In Depth: k-Means Clustering\"\n "
},
{
"path": "notebooks/05.12-Gaussian-Mixtures.ipynb",
"chars": 547543,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# In Depth: Gaussian Mixture Models"
},
{
"path": "notebooks/05.13-Kernel-Density-Estimation.ipynb",
"chars": 101588,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# In Depth: Kernel Density Estimati"
},
{
"path": "notebooks/05.14-Image-Features.ipynb",
"chars": 391040,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Application: A Face Detection Pip"
},
{
"path": "notebooks/05.15-Learning-More.ipynb",
"chars": 2949,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Further Machine Learning Resource"
},
{
"path": "notebooks/06.00-Figure-Code.ipynb",
"chars": 1681313,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Appendix: Figure Code\"\n ]\n },\n"
},
{
"path": "notebooks/Untitled.ipynb",
"chars": 72,
"preview": "{\n \"cells\": [],\n \"metadata\": {},\n \"nbformat\": 4,\n \"nbformat_minor\": 5\n}\n"
},
{
"path": "notebooks/data/births.csv",
"chars": 264648,
"preview": "year,month,day,gender,births\n1969,1,1,F,4046\n1969,1,1,M,4440\n1969,1,2,F,4454\n1969,1,2,M,4548\n1969,1,3,F,4548\n1969,1,3,M,"
},
{
"path": "notebooks/data/president_heights.csv",
"chars": 1028,
"preview": "order,name,height(cm)\n1,George Washington,189\n2,John Adams,170\n3,Thomas Jefferson,189\n4,James Madison,163\n5,James Monroe"
},
{
"path": "notebooks/data/state-abbrevs.csv",
"chars": 872,
"preview": "\"state\",\"abbreviation\"\n\"Alabama\",\"AL\"\n\"Alaska\",\"AK\"\n\"Arizona\",\"AZ\"\n\"Arkansas\",\"AR\"\n\"California\",\"CA\"\n\"Colorado\",\"CO\"\n\"Co"
},
{
"path": "notebooks/data/state-areas.csv",
"chars": 835,
"preview": "state,area (sq. mi)\nAlabama,52423\nAlaska,656425\nArizona,114006\nArkansas,53182\nCalifornia,163707\nColorado,104100\nConnecti"
},
{
"path": "notebooks/data/state-population.csv",
"chars": 57935,
"preview": "state/region,ages,year,population\nAL,under18,2012,1117489\nAL,total,2012,4817528\nAL,under18,2010,1130966\nAL,total,2010,47"
},
{
"path": "notebooks/helpers_05_08.py",
"chars": 2836,
"preview": "\nimport numpy as np\nimport matplotlib.pyplot as plt; plt.rcParams['figure.dpi'] = 600\nfrom sklearn.tree import DecisionT"
},
{
"path": "notebooks_v1/00.00-Preface.ipynb",
"chars": 13965,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"<!--BOOK_INFORMATION-->\\n\",\n \"<i"
},
{
"path": "notebooks_v1/01.00-IPython-Beyond-Normal-Python.ipynb",
"chars": 8510,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"<!--BOOK_INFORMATION-->\\n\",\n \"<i"
},
{
"path": "notebooks_v1/01.01-Help-And-Documentation.ipynb",
"chars": 15448,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"<!--BOOK_INFORMATION-->\\n\",\n \"<i"
},
{
"path": "notebooks_v1/01.02-Shell-Keyboard-Shortcuts.ipynb",
"chars": 10620,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"<!--BOOK_INFORMATION-->\\n\",\n \"<i"
},
{
"path": "notebooks_v1/01.03-Magic-Commands.ipynb",
"chars": 9933,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"<!--BOOK_INFORMATION-->\\n\",\n \"<i"
},
{
"path": "notebooks_v1/01.04-Input-Output-History.ipynb",
"chars": 9125,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"<!--BOOK_INFORMATION-->\\n\",\n \"<i"
},
{
"path": "notebooks_v1/01.05-IPython-And-Shell-Commands.ipynb",
"chars": 11576,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"<!--BOOK_INFORMATION-->\\n\",\n \"<i"
},
{
"path": "notebooks_v1/01.06-Errors-and-Debugging.ipynb",
"chars": 21247,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"<!--BOOK_INFORMATION-->\\n\",\n \"<i"
},
{
"path": "notebooks_v1/01.07-Timing-and-Profiling.ipynb",
"chars": 18944,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"<!--BOOK_INFORMATION-->\\n\",\n \"<i"
},
{
"path": "notebooks_v1/01.08-More-IPython-Resources.ipynb",
"chars": 5627,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"<!--BOOK_INFORMATION-->\\n\",\n \"<i"
},
{
"path": "notebooks_v1/02.00-Introduction-to-NumPy.ipynb",
"chars": 7580,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {\n \"deletable\": true,\n \"editable\": true\n },\n \"sou"
},
{
"path": "notebooks_v1/02.01-Understanding-Data-Types.ipynb",
"chars": 23919,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"<!--BOOK_INFORMATION-->\\n\",\n \"<i"
},
{
"path": "notebooks_v1/02.02-The-Basics-Of-NumPy-Arrays.ipynb",
"chars": 33513,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"<!--BOOK_INFORMATION-->\\n\",\n \"<i"
},
{
"path": "notebooks_v1/02.03-Computation-on-arrays-ufuncs.ipynb",
"chars": 32070,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"<!--BOOK_INFORMATION-->\\n\",\n \"<i"
},
{
"path": "notebooks_v1/02.04-Computation-on-arrays-aggregates.ipynb",
"chars": 31320,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"<!--BOOK_INFORMATION-->\\n\",\n \"<i"
},
{
"path": "notebooks_v1/02.05-Computation-on-arrays-broadcasting.ipynb",
"chars": 102235,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"<!--BOOK_INFORMATION-->\\n\",\n \"<i"
},
{
"path": "notebooks_v1/02.06-Boolean-Arrays-and-Masks.ipynb",
"chars": 41170,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"<!--BOOK_INFORMATION-->\\n\",\n \"<i"
},
{
"path": "notebooks_v1/02.07-Fancy-Indexing.ipynb",
"chars": 63004,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"<!--BOOK_INFORMATION-->\\n\",\n \"<i"
},
{
"path": "notebooks_v1/02.08-Sorting.ipynb",
"chars": 62233,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"<!--BOOK_INFORMATION-->\\n\",\n \"<i"
},
{
"path": "notebooks_v1/02.09-Structured-Data-NumPy.ipynb",
"chars": 17251,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"<!--BOOK_INFORMATION-->\\n\",\n \"<i"
},
{
"path": "notebooks_v1/03.00-Introduction-to-Pandas.ipynb",
"chars": 7073,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"<!--BOOK_INFORMATION-->\\n\",\n \"<i"
},
{
"path": "notebooks_v1/03.01-Introducing-Pandas-Objects.ipynb",
"chars": 40118,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"<!--BOOK_INFORMATION-->\\n\",\n \"<i"
},
{
"path": "notebooks_v1/03.03-Operations-in-Pandas.ipynb",
"chars": 27593,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"<!--BOOK_INFORMATION-->\\n\",\n \"<i"
},
{
"path": "notebooks_v1/03.04-Missing-Values.ipynb",
"chars": 37428,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"<!--BOOK_INFORMATION-->\\n\",\n \"<i"
},
{
"path": "notebooks_v1/03.05-Hierarchical-Indexing.ipynb",
"chars": 76224,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {\n \"deletable\": true,\n \"editable\": true\n },\n \"sou"
},
{
"path": "notebooks_v1/03.06-Concat-And-Append.ipynb",
"chars": 47490,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"<!--BOOK_INFORMATION-->\\n\",\n \"<i"
},
{
"path": "notebooks_v1/03.08-Aggregation-and-Grouping.ipynb",
"chars": 79265,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"<!--BOOK_INFORMATION-->\\n\",\n \"<i"
},
{
"path": "notebooks_v1/03.09-Pivot-Tables.ipynb",
"chars": 185852,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"<!--BOOK_INFORMATION-->\\n\",\n \"<i"
},
{
"path": "notebooks_v1/03.12-Performance-Eval-and-Query.ipynb",
"chars": 31156,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"<!--BOOK_INFORMATION-->\\n\",\n \"<i"
},
{
"path": "notebooks_v1/03.13-Further-Resources.ipynb",
"chars": 5057,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {\n \"deletable\": true,\n \"editable\": true\n },\n \"sou"
},
{
"path": "notebooks_v1/04.00-Introduction-To-Matplotlib.ipynb",
"chars": 101171,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"<!--BOOK_INFORMATION-->\\n\",\n \"<i"
},
{
"path": "notebooks_v1/04.03-Errorbars.ipynb",
"chars": 47770,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"<!--BOOK_INFORMATION-->\\n\",\n \"<i"
},
{
"path": "notebooks_v1/04.04-Density-and-Contour-Plots.ipynb",
"chars": 497727,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"<!--BOOK_INFORMATION-->\\n\",\n \"<i"
}
]
// ... and 75 more files (download for full content)
About this extraction
This page contains the full source code of the jakevdp/PythonDataScienceHandbook GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 187 files (33.8 MB), approximately 4.1M tokens, and a symbol index with 3 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.