Repository: satijalab/seurat-wrappers Branch: master Commit: ffaf74e30627 Files: 118 Total size: 51.6 MB Directory structure: gitextract_4pdtb5c6/ ├── .Rbuildignore ├── .github/ │ ├── no-response.yml │ └── workflows/ │ └── test-vignettes.yml ├── .gitignore ├── DESCRIPTION ├── LICENSE ├── NAMESPACE ├── R/ │ ├── alevin.R │ ├── alra.R │ ├── banksy.R │ ├── cellbrowser.R │ ├── cogaps.R │ ├── conos.R │ ├── fast_mnn.R │ ├── fast_mnn_v5.R │ ├── glmpca.R │ ├── internal.R │ ├── liger.R │ ├── miqc.R │ ├── monocle3.R │ ├── pacmap.R │ ├── presto.R │ ├── scVI.R │ ├── tricycle.R │ └── velocity.R ├── README.md ├── docs/ │ ├── README.md │ ├── alevin.Rmd │ ├── alevin.html │ ├── alevin.md │ ├── alra.Rmd │ ├── alra.html │ ├── alra.md │ ├── banksy.Rmd │ ├── banksy.md │ ├── cellbrowser.Rmd │ ├── cellbrowser.html │ ├── cellbrowser.md │ ├── cipr.Rmd │ ├── cipr.html │ ├── cipr.md │ ├── cogaps.html │ ├── cogaps.md │ ├── cogaps.rmd │ ├── conos.Rmd │ ├── conos.html │ ├── conos.md │ ├── fast_mnn.Rmd │ ├── fast_mnn.html │ ├── fast_mnn.md │ ├── glmpca.Rmd │ ├── glmpca.html │ ├── glmpca.md │ ├── harmony.Rmd │ ├── harmony.html │ ├── harmony.md │ ├── liger.Rmd │ ├── liger.html │ ├── liger.md │ ├── miQC.Rmd │ ├── miQC.html │ ├── miQC.md │ ├── monocle3.Rmd │ ├── monocle3.html │ ├── monocle3.md │ ├── nebulosa.Rmd │ ├── nebulosa.html │ ├── nebulosa.md │ ├── pacmap.Rmd │ ├── pacmap.html │ ├── pacmap.md │ ├── presto.html │ ├── presto.md │ ├── presto.rmd │ ├── schex.Rmd │ ├── schex.html │ ├── schex.md │ ├── scvelo.Rmd │ ├── scvelo.html │ ├── scvelo.md │ ├── tricycle.Rmd │ ├── tricycle.html │ ├── tricycle.md │ ├── velocity.Rmd │ ├── velocity.html │ └── velocity.md ├── man/ │ ├── ALRAChooseKPlot.Rd │ ├── CellBrowser.Rd │ ├── FastMNNIntegration.Rd │ ├── LearnGraph.Rd │ ├── PlotMiQC.Rd │ ├── ReadAlevin.Rd │ ├── ReadVelocity.Rd │ ├── RunALRA.Rd │ ├── RunBanksy.Rd │ ├── RunCoGAPS.Rd │ ├── RunFastMNN.Rd │ ├── RunGLMPCA.Rd │ ├── RunMiQC.Rd │ ├── RunOptimizeALS.Rd │ ├── RunPaCMAP.Rd │ ├── RunPresto.Rd │ ├── 
RunPrestoAll.Rd │ ├── RunQuantileAlignSNF.Rd │ ├── RunQuantileNorm.Rd │ ├── RunSNF.Rd │ ├── RunVelocity.Rd │ ├── Runtricycle.Rd │ ├── SeuratWrappers-package.Rd │ ├── StopCellbrowser.Rd │ ├── VeloPlot.Rd │ ├── as.Seurat.extras.Rd │ ├── as.cell_data_set.Rd │ ├── findMatrix.Rd │ ├── scVIIntegration.Rd │ └── writeSparseTsvChunks.Rd ├── seurat-wrappers.Rproj └── test-vignettes.sh ================================================ FILE CONTENTS ================================================ ================================================ FILE: .Rbuildignore ================================================ ^.*\.Rproj$ ^\.Rproj\.user$ ^docs$ ================================================ FILE: .github/no-response.yml ================================================ # Configuration for probot-no-response - https://github.com/probot/no-response # Number of days of inactivity before an Issue is closed for lack of response daysUntilClose: 14 # Label requiring a response responseRequiredLabel: more-information-needed # Comment to post when closing an Issue for lack of response. Set to `false` to disable closeComment: > This issue has been automatically closed because there has been no response to our request for more information from the original author. With only the information that is currently in the issue, we don't have enough information to take action. Please reach out if you have or find the answers we need so that we can investigate further. 
================================================ FILE: .github/workflows/test-vignettes.yml ================================================ on: [push, pull_request] jobs: test-vignettes: name: Test any changes to vignettes runs-on: ubuntu-latest container: image: satijalab/seurat-wrappers:latest env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} steps: - uses: actions/checkout@v2 - name: Set R repo run: echo 'options(repos = "https://cloud.r-project.org")' > ~/.Rprofile - name: Install SeuratWrappers run: devtools::install(upgrade = FALSE) shell: Rscript {0} - name: Test Vignettes run: bash test-vignettes.sh - name: Upload files uses: actions/upload-artifact@master with: name: test-build path: test-build ================================================ FILE: .gitignore ================================================ .Rproj.user .Rhistory .RData .Ruserdata *_cache *.h5Seurat *.h5seurat *.h5ad ================================================ FILE: DESCRIPTION ================================================ Package: SeuratWrappers Title: Community-Provided Methods and Extensions for the Seurat Object Version: 0.4.0 Date: 2024-11-20 Authors@R: c( person(given = 'Andrew', family = 'Butler', email = 'abutler@nygenome.org', role = 'aut', comment = c(ORCID = '0000-0003-3608-0463')), person(given = "Saket", family = "Choudhary", email = "schoudhary@nygenome.org", role = "ctb", comment = c(ORCID = "0000-0001-5202-7633")), person(given = 'David', family = 'Collins', email = 'dcollins@nygenome.org', role = 'ctb', comment = c(ORCID = '0000-0001-9243-7821')), person(given = "Yuhan", family = "Hao", email = "yhao@nygenome.org", role = "ctb", comment = c(ORCID = "0000-0002-1810-0822")), person(given = "Austin", family = "Hartman", email = "ahartman@nygenome.org", role = "ctb", comment = c(ORCID = "0000-0001-7278-1852")), person(given = 'Paul', family = 'Hoffman', email = 'nygcSatijalab@nygenome.org', role = c('aut', 'cre'), comment = c(ORCID 
= '0000-0002-7693-8957')), person(given = "Gesmira", family = "Molla", email = 'gmolla@nygenome.org', role = 'ctb', comment = c(ORCID = '0000-0002-8628-5056')), person(given = 'Rahul', family = 'Satija', email = 'rsatija@nygenome.org', role = 'aut', comment = c(ORCID = '0000-0001-9448-8833')), person(given = 'Tim', family = 'Stuart', email = 'tstuart@nygenome.org', role = 'aut', comment = c(ORCID = '0000-0002-3044-0897')) ) Description: SeuratWrappers is a collection of community-provided methods and extensions for Seurat, curated by the Satija Lab at NYGC. These methods comprise functionality not presently found in Seurat, and are able to be updated much more frequently. License: GPL-3 | file LICENSE Remotes: welch-lab/liger, hms-dbmi/conos, immunogenomics/harmony, immunogenomics/presto, satijalab/seurat-data, velocyto-team/velocyto.R, SaskiaFreytag/schex@031320d, cole-trapnell-lab/monocle3, mojaveazure/seurat-disk, powellgenomicslab/Nebulosa, atakanekiz/CIPR-Package, prabhakarlab/Banksy Depends: R (>= 3.5.0) biocViews: Imports: BiocManager, cowplot, ggplot2, igraph, Matrix, methods, remotes, rsvd, Seurat (>= 5.0.0), stats, utils, rlang Collate: 'internal.R' 'alevin.R' 'alra.R' 'banksy.R' 'cellbrowser.R' 'cogaps.R' 'conos.R' 'fast_mnn.R' 'fast_mnn_v5.R' 'glmpca.R' 'liger.R' 'miqc.R' 'monocle3.R' 'pacmap.R' 'presto.R' 'scVI.R' 'tricycle.R' 'velocity.R' Encoding: UTF-8 LazyData: true RoxygenNote: 7.3.2 Suggests: cipr, conos, rliger (>= 0.5.0), harmony, batchelor, SeuratData, SeuratDisk, velocyto.R, schex, tximport, fishpond, monocle3, CoGAPS, glmpca, Nebulosa, presto, flexmix, tricycle, Banksy ================================================ FILE: LICENSE ================================================ GNU GENERAL PUBLIC LICENSE Version 3, 29 June 2007 Copyright (C) 2007 Free Software Foundation, Inc. Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. 
Preamble The GNU General Public License is a free, copyleft license for software and other kinds of works. The licenses for most software and other practical works are designed to take away your freedom to share and change the works. By contrast, the GNU General Public License is intended to guarantee your freedom to share and change all versions of a program--to make sure it remains free software for all its users. We, the Free Software Foundation, use the GNU General Public License for most of our software; it applies also to any other work released this way by its authors. You can apply it to your programs, too. When we speak of free software, we are referring to freedom, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for them if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs, and that you know you can do these things. To protect your rights, we need to prevent others from denying you these rights or asking you to surrender the rights. Therefore, you have certain responsibilities if you distribute copies of the software, or if you modify it: responsibilities to respect the freedom of others. For example, if you distribute copies of such a program, whether gratis or for a fee, you must pass on to the recipients the same freedoms that you received. You must make sure that they, too, receive or can get the source code. And you must show them these terms so they know their rights. Developers that use the GNU GPL protect your rights with two steps: (1) assert copyright on the software, and (2) offer you this License giving you legal permission to copy, distribute and/or modify it. For the developers' and authors' protection, the GPL clearly explains that there is no warranty for this free software. 
For both users' and authors' sake, the GPL requires that modified versions be marked as changed, so that their problems will not be attributed erroneously to authors of previous versions. Some devices are designed to deny users access to install or run modified versions of the software inside them, although the manufacturer can do so. This is fundamentally incompatible with the aim of protecting users' freedom to change the software. The systematic pattern of such abuse occurs in the area of products for individuals to use, which is precisely where it is most unacceptable. Therefore, we have designed this version of the GPL to prohibit the practice for those products. If such problems arise substantially in other domains, we stand ready to extend this provision to those domains in future versions of the GPL, as needed to protect the freedom of users. Finally, every program is threatened constantly by software patents. States should not allow patents to restrict development and use of software on general-purpose computers, but in those that do, we wish to avoid the special danger that patents applied to a free program could make it effectively proprietary. To prevent this, the GPL assures that patents cannot be used to render the program non-free. The precise terms and conditions for copying, distribution and modification follow. TERMS AND CONDITIONS 0. Definitions. "This License" refers to version 3 of the GNU General Public License. "Copyright" also means copyright-like laws that apply to other kinds of works, such as semiconductor masks. "The Program" refers to any copyrightable work licensed under this License. Each licensee is addressed as "you". "Licensees" and "recipients" may be individuals or organizations. To "modify" a work means to copy from or adapt all or part of the work in a fashion requiring copyright permission, other than the making of an exact copy. 
The resulting work is called a "modified version" of the earlier work or a work "based on" the earlier work. A "covered work" means either the unmodified Program or a work based on the Program. To "propagate" a work means to do anything with it that, without permission, would make you directly or secondarily liable for infringement under applicable copyright law, except executing it on a computer or modifying a private copy. Propagation includes copying, distribution (with or without modification), making available to the public, and in some countries other activities as well. To "convey" a work means any kind of propagation that enables other parties to make or receive copies. Mere interaction with a user through a computer network, with no transfer of a copy, is not conveying. An interactive user interface displays "Appropriate Legal Notices" to the extent that it includes a convenient and prominently visible feature that (1) displays an appropriate copyright notice, and (2) tells the user that there is no warranty for the work (except to the extent that warranties are provided), that licensees may convey the work under this License, and how to view a copy of this License. If the interface presents a list of user commands or options, such as a menu, a prominent item in the list meets this criterion. 1. Source Code. The "source code" for a work means the preferred form of the work for making modifications to it. "Object code" means any non-source form of a work. A "Standard Interface" means an interface that either is an official standard defined by a recognized standards body, or, in the case of interfaces specified for a particular programming language, one that is widely used among developers working in that language. 
The "System Libraries" of an executable work include anything, other than the work as a whole, that (a) is included in the normal form of packaging a Major Component, but which is not part of that Major Component, and (b) serves only to enable use of the work with that Major Component, or to implement a Standard Interface for which an implementation is available to the public in source code form. A "Major Component", in this context, means a major essential component (kernel, window system, and so on) of the specific operating system (if any) on which the executable work runs, or a compiler used to produce the work, or an object code interpreter used to run it. The "Corresponding Source" for a work in object code form means all the source code needed to generate, install, and (for an executable work) run the object code and to modify the work, including scripts to control those activities. However, it does not include the work's System Libraries, or general-purpose tools or generally available free programs which are used unmodified in performing those activities but which are not part of the work. For example, Corresponding Source includes interface definition files associated with source files for the work, and the source code for shared libraries and dynamically linked subprograms that the work is specifically designed to require, such as by intimate data communication or control flow between those subprograms and other parts of the work. The Corresponding Source need not include anything that users can regenerate automatically from other parts of the Corresponding Source. The Corresponding Source for a work in source code form is that same work. 2. Basic Permissions. All rights granted under this License are granted for the term of copyright on the Program, and are irrevocable provided the stated conditions are met. This License explicitly affirms your unlimited permission to run the unmodified Program. 
The output from running a covered work is covered by this License only if the output, given its content, constitutes a covered work. This License acknowledges your rights of fair use or other equivalent, as provided by copyright law. You may make, run and propagate covered works that you do not convey, without conditions so long as your license otherwise remains in force. You may convey covered works to others for the sole purpose of having them make modifications exclusively for you, or provide you with facilities for running those works, provided that you comply with the terms of this License in conveying all material for which you do not control copyright. Those thus making or running the covered works for you must do so exclusively on your behalf, under your direction and control, on terms that prohibit them from making any copies of your copyrighted material outside their relationship with you. Conveying under any other circumstances is permitted solely under the conditions stated below. Sublicensing is not allowed; section 10 makes it unnecessary. 3. Protecting Users' Legal Rights From Anti-Circumvention Law. No covered work shall be deemed part of an effective technological measure under any applicable law fulfilling obligations under article 11 of the WIPO copyright treaty adopted on 20 December 1996, or similar laws prohibiting or restricting circumvention of such measures. When you convey a covered work, you waive any legal power to forbid circumvention of technological measures to the extent such circumvention is effected by exercising rights under this License with respect to the covered work, and you disclaim any intention to limit operation or modification of the work as a means of enforcing, against the work's users, your or third parties' legal rights to forbid circumvention of technological measures. 4. Conveying Verbatim Copies. 
You may convey verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice; keep intact all notices stating that this License and any non-permissive terms added in accord with section 7 apply to the code; keep intact all notices of the absence of any warranty; and give all recipients a copy of this License along with the Program. You may charge any price or no price for each copy that you convey, and you may offer support or warranty protection for a fee. 5. Conveying Modified Source Versions. You may convey a work based on the Program, or the modifications to produce it from the Program, in the form of source code under the terms of section 4, provided that you also meet all of these conditions: a) The work must carry prominent notices stating that you modified it, and giving a relevant date. b) The work must carry prominent notices stating that it is released under this License and any conditions added under section 7. This requirement modifies the requirement in section 4 to "keep intact all notices". c) You must license the entire work, as a whole, under this License to anyone who comes into possession of a copy. This License will therefore apply, along with any applicable section 7 additional terms, to the whole of the work, and all its parts, regardless of how they are packaged. This License gives no permission to license the work in any other way, but it does not invalidate such permission if you have separately received it. d) If the work has interactive user interfaces, each must display Appropriate Legal Notices; however, if the Program has interactive interfaces that do not display Appropriate Legal Notices, your work need not make them do so. 
A compilation of a covered work with other separate and independent works, which are not by their nature extensions of the covered work, and which are not combined with it such as to form a larger program, in or on a volume of a storage or distribution medium, is called an "aggregate" if the compilation and its resulting copyright are not used to limit the access or legal rights of the compilation's users beyond what the individual works permit. Inclusion of a covered work in an aggregate does not cause this License to apply to the other parts of the aggregate. 6. Conveying Non-Source Forms. You may convey a covered work in object code form under the terms of sections 4 and 5, provided that you also convey the machine-readable Corresponding Source under the terms of this License, in one of these ways: a) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by the Corresponding Source fixed on a durable physical medium customarily used for software interchange. b) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by a written offer, valid for at least three years and valid for as long as you offer spare parts or customer support for that product model, to give anyone who possesses the object code either (1) a copy of the Corresponding Source for all the software in the product that is covered by this License, on a durable physical medium customarily used for software interchange, for a price no more than your reasonable cost of physically performing this conveying of source, or (2) access to copy the Corresponding Source from a network server at no charge. c) Convey individual copies of the object code with a copy of the written offer to provide the Corresponding Source. This alternative is allowed only occasionally and noncommercially, and only if you received the object code with such an offer, in accord with subsection 6b. 
d) Convey the object code by offering access from a designated place (gratis or for a charge), and offer equivalent access to the Corresponding Source in the same way through the same place at no further charge. You need not require recipients to copy the Corresponding Source along with the object code. If the place to copy the object code is a network server, the Corresponding Source may be on a different server (operated by you or a third party) that supports equivalent copying facilities, provided you maintain clear directions next to the object code saying where to find the Corresponding Source. Regardless of what server hosts the Corresponding Source, you remain obligated to ensure that it is available for as long as needed to satisfy these requirements. e) Convey the object code using peer-to-peer transmission, provided you inform other peers where the object code and Corresponding Source of the work are being offered to the general public at no charge under subsection 6d. A separable portion of the object code, whose source code is excluded from the Corresponding Source as a System Library, need not be included in conveying the object code work. A "User Product" is either (1) a "consumer product", which means any tangible personal property which is normally used for personal, family, or household purposes, or (2) anything designed or sold for incorporation into a dwelling. In determining whether a product is a consumer product, doubtful cases shall be resolved in favor of coverage. For a particular product received by a particular user, "normally used" refers to a typical or common use of that class of product, regardless of the status of the particular user or of the way in which the particular user actually uses, or expects or is expected to use, the product. 
A product is a consumer product regardless of whether the product has substantial commercial, industrial or non-consumer uses, unless such uses represent the only significant mode of use of the product. "Installation Information" for a User Product means any methods, procedures, authorization keys, or other information required to install and execute modified versions of a covered work in that User Product from a modified version of its Corresponding Source. The information must suffice to ensure that the continued functioning of the modified object code is in no case prevented or interfered with solely because modification has been made. If you convey an object code work under this section in, or with, or specifically for use in, a User Product, and the conveying occurs as part of a transaction in which the right of possession and use of the User Product is transferred to the recipient in perpetuity or for a fixed term (regardless of how the transaction is characterized), the Corresponding Source conveyed under this section must be accompanied by the Installation Information. But this requirement does not apply if neither you nor any third party retains the ability to install modified object code on the User Product (for example, the work has been installed in ROM). The requirement to provide Installation Information does not include a requirement to continue to provide support service, warranty, or updates for a work that has been modified or installed by the recipient, or for the User Product in which it has been modified or installed. Access to a network may be denied when the modification itself materially and adversely affects the operation of the network or violates the rules and protocols for communication across the network. 
Corresponding Source conveyed, and Installation Information provided, in accord with this section must be in a format that is publicly documented (and with an implementation available to the public in source code form), and must require no special password or key for unpacking, reading or copying. 7. Additional Terms. "Additional permissions" are terms that supplement the terms of this License by making exceptions from one or more of its conditions. Additional permissions that are applicable to the entire Program shall be treated as though they were included in this License, to the extent that they are valid under applicable law. If additional permissions apply only to part of the Program, that part may be used separately under those permissions, but the entire Program remains governed by this License without regard to the additional permissions. When you convey a copy of a covered work, you may at your option remove any additional permissions from that copy, or from any part of it. (Additional permissions may be written to require their own removal in certain cases when you modify the work.) You may place additional permissions on material, added by you to a covered work, for which you have or can give appropriate copyright permission. 
Notwithstanding any other provision of this License, for material you add to a covered work, you may (if authorized by the copyright holders of that material) supplement the terms of this License with terms: a) Disclaiming warranty or limiting liability differently from the terms of sections 15 and 16 of this License; or b) Requiring preservation of specified reasonable legal notices or author attributions in that material or in the Appropriate Legal Notices displayed by works containing it; or c) Prohibiting misrepresentation of the origin of that material, or requiring that modified versions of such material be marked in reasonable ways as different from the original version; or d) Limiting the use for publicity purposes of names of licensors or authors of the material; or e) Declining to grant rights under trademark law for use of some trade names, trademarks, or service marks; or f) Requiring indemnification of licensors and authors of that material by anyone who conveys the material (or modified versions of it) with contractual assumptions of liability to the recipient, for any liability that these contractual assumptions directly impose on those licensors and authors. All other non-permissive additional terms are considered "further restrictions" within the meaning of section 10. If the Program as you received it, or any part of it, contains a notice stating that it is governed by this License along with a term that is a further restriction, you may remove that term. If a license document contains a further restriction but permits relicensing or conveying under this License, you may add to a covered work material governed by the terms of that license document, provided that the further restriction does not survive such relicensing or conveying. 
If you add terms to a covered work in accord with this section, you must place, in the relevant source files, a statement of the additional terms that apply to those files, or a notice indicating where to find the applicable terms. Additional terms, permissive or non-permissive, may be stated in the form of a separately written license, or stated as exceptions; the above requirements apply either way. 8. Termination. You may not propagate or modify a covered work except as expressly provided under this License. Any attempt otherwise to propagate or modify it is void, and will automatically terminate your rights under this License (including any patent licenses granted under the third paragraph of section 11). However, if you cease all violation of this License, then your license from a particular copyright holder is reinstated (a) provisionally, unless and until the copyright holder explicitly and finally terminates your license, and (b) permanently, if the copyright holder fails to notify you of the violation by some reasonable means prior to 60 days after the cessation. Moreover, your license from a particular copyright holder is reinstated permanently if the copyright holder notifies you of the violation by some reasonable means, this is the first time you have received notice of violation of this License (for any work) from that copyright holder, and you cure the violation prior to 30 days after your receipt of the notice. Termination of your rights under this section does not terminate the licenses of parties who have received copies or rights from you under this License. If your rights have been terminated and not permanently reinstated, you do not qualify to receive new licenses for the same material under section 10. 9. Acceptance Not Required for Having Copies. You are not required to accept this License in order to receive or run a copy of the Program. 
Ancillary propagation of a covered work occurring solely as a consequence of using peer-to-peer transmission to receive a copy likewise does not require acceptance. However, nothing other than this License grants you permission to propagate or modify any covered work. These actions infringe copyright if you do not accept this License. Therefore, by modifying or propagating a covered work, you indicate your acceptance of this License to do so. 10. Automatic Licensing of Downstream Recipients. Each time you convey a covered work, the recipient automatically receives a license from the original licensors, to run, modify and propagate that work, subject to this License. You are not responsible for enforcing compliance by third parties with this License. An "entity transaction" is a transaction transferring control of an organization, or substantially all assets of one, or subdividing an organization, or merging organizations. If propagation of a covered work results from an entity transaction, each party to that transaction who receives a copy of the work also receives whatever licenses to the work the party's predecessor in interest had or could give under the previous paragraph, plus a right to possession of the Corresponding Source of the work from the predecessor in interest, if the predecessor has it or can get it with reasonable efforts. You may not impose any further restrictions on the exercise of the rights granted or affirmed under this License. For example, you may not impose a license fee, royalty, or other charge for exercise of rights granted under this License, and you may not initiate litigation (including a cross-claim or counterclaim in a lawsuit) alleging that any patent claim is infringed by making, using, selling, offering for sale, or importing the Program or any portion of it. 11. Patents. A "contributor" is a copyright holder who authorizes use under this License of the Program or a work on which the Program is based. 
The work thus licensed is called the contributor's "contributor version". A contributor's "essential patent claims" are all patent claims owned or controlled by the contributor, whether already acquired or hereafter acquired, that would be infringed by some manner, permitted by this License, of making, using, or selling its contributor version, but do not include claims that would be infringed only as a consequence of further modification of the contributor version. For purposes of this definition, "control" includes the right to grant patent sublicenses in a manner consistent with the requirements of this License. Each contributor grants you a non-exclusive, worldwide, royalty-free patent license under the contributor's essential patent claims, to make, use, sell, offer for sale, import and otherwise run, modify and propagate the contents of its contributor version. In the following three paragraphs, a "patent license" is any express agreement or commitment, however denominated, not to enforce a patent (such as an express permission to practice a patent or covenant not to sue for patent infringement). To "grant" such a patent license to a party means to make such an agreement or commitment not to enforce a patent against the party. If you convey a covered work, knowingly relying on a patent license, and the Corresponding Source of the work is not available for anyone to copy, free of charge and under the terms of this License, through a publicly available network server or other readily accessible means, then you must either (1) cause the Corresponding Source to be so available, or (2) arrange to deprive yourself of the benefit of the patent license for this particular work, or (3) arrange, in a manner consistent with the requirements of this License, to extend the patent license to downstream recipients. 
"Knowingly relying" means you have actual knowledge that, but for the patent license, your conveying the covered work in a country, or your recipient's use of the covered work in a country, would infringe one or more identifiable patents in that country that you have reason to believe are valid. If, pursuant to or in connection with a single transaction or arrangement, you convey, or propagate by procuring conveyance of, a covered work, and grant a patent license to some of the parties receiving the covered work authorizing them to use, propagate, modify or convey a specific copy of the covered work, then the patent license you grant is automatically extended to all recipients of the covered work and works based on it. A patent license is "discriminatory" if it does not include within the scope of its coverage, prohibits the exercise of, or is conditioned on the non-exercise of one or more of the rights that are specifically granted under this License. You may not convey a covered work if you are a party to an arrangement with a third party that is in the business of distributing software, under which you make payment to the third party based on the extent of your activity of conveying the work, and under which the third party grants, to any of the parties who would receive the covered work from you, a discriminatory patent license (a) in connection with copies of the covered work conveyed by you (or copies made from those copies), or (b) primarily for and in connection with specific products or compilations that contain the covered work, unless you entered into that arrangement, or that patent license was granted, prior to 28 March 2007. Nothing in this License shall be construed as excluding or limiting any implied license or other defenses to infringement that may otherwise be available to you under applicable patent law. 12. No Surrender of Others' Freedom. 
If conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot convey a covered work so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not convey it at all. For example, if you agree to terms that obligate you to collect a royalty for further conveying from those to whom you convey the Program, the only way you could satisfy both those terms and this License would be to refrain entirely from conveying the Program. 13. Use with the GNU Affero General Public License. Notwithstanding any other provision of this License, you have permission to link or combine any covered work with a work licensed under version 3 of the GNU Affero General Public License into a single combined work, and to convey the resulting work. The terms of this License will continue to apply to the part which is the covered work, but the special requirements of the GNU Affero General Public License, section 13, concerning interaction through a network will apply to the combination as such. 14. Revised Versions of this License. The Free Software Foundation may publish revised and/or new versions of the GNU General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies that a certain numbered version of the GNU General Public License "or any later version" applies to it, you have the option of following the terms and conditions either of that numbered version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of the GNU General Public License, you may choose any version ever published by the Free Software Foundation. 
If the Program specifies that a proxy can decide which future versions of the GNU General Public License can be used, that proxy's public statement of acceptance of a version permanently authorizes you to choose that version for the Program. Later license versions may give you additional or different permissions. However, no additional obligations are imposed on any author or copyright holder as a result of your choosing to follow a later version. 15. Disclaimer of Warranty. THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 16. Limitation of Liability. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 17. Interpretation of Sections 15 and 16. 
If the disclaimer of warranty and limitation of liability provided above cannot be given local legal effect according to their terms, reviewing courts shall apply local law that most closely approximates an absolute waiver of all civil liability in connection with the Program, unless a warranty or assumption of liability accompanies a copy of the Program in return for a fee. END OF TERMS AND CONDITIONS How to Apply These Terms to Your New Programs If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms. To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively state the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. Copyright (C) This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . Also add information on how to contact you by electronic and paper mail. If the program does terminal interaction, make it output a short notice like this when it starts in an interactive mode: Copyright (C) This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. This is free software, and you are welcome to redistribute it under certain conditions; type `show c' for details. 
The hypothetical commands `show w' and `show c' should show the appropriate parts of the General Public License. Of course, your program's commands might be different; for a GUI interface, you would use an "about box". You should also get your employer (if you work as a programmer) or school, if any, to sign a "copyright disclaimer" for the program, if necessary. For more information on this, and how to apply and follow the GNU GPL, see . The GNU General Public License does not permit incorporating your program into proprietary programs. If your program is a subroutine library, you may consider it more useful to permit linking proprietary applications with the library. If this is what you want to do, use the GNU Lesser General Public License instead of this License. But first, please read . ================================================ FILE: NAMESPACE ================================================ # Generated by roxygen2: do not edit by hand S3method(RunALRA,Seurat) S3method(RunALRA,default) S3method(RunPaCMAP,Seurat) S3method(RunPaCMAP,default) S3method(as.Seurat,Conos) S3method(as.Seurat,cell_data_set) S3method(as.Seurat,list) S3method(as.cell_data_set,Seurat) export(ALRAChooseKPlot) export(ExportToCellbrowser) export(FastMNNIntegration) export(PlotMiQC) export(ReadAlevin) export(ReadVelocity) export(RunALRA) export(RunBanksy) export(RunCoGAPS) export(RunFastMNN) export(RunGLMPCA) export(RunMiQC) export(RunOptimizeALS) export(RunPaCMAP) export(RunPresto) export(RunPrestoAll) export(RunQuantileAlignSNF) export(RunQuantileNorm) export(RunSNF) export(RunVelocity) export(Runtricycle) export(StopCellbrowser) export(as.cell_data_set) export(scVIIntegration) importFrom(BiocManager,install) importFrom(Matrix,Matrix) importFrom(Matrix,writeMM) importFrom(R.utils,gzip) importFrom(Seurat,"DefaultAssay<-") importFrom(Seurat,"Idents<-") importFrom(Seurat,"Key<-") importFrom(Seurat,"Loadings<-") importFrom(Seurat,"Misc<-") importFrom(Seurat,"Tool<-") 
importFrom(Seurat,"VariableFeatures<-") importFrom(Seurat,CombinePlots) importFrom(Seurat,CreateAssayObject) importFrom(Seurat,CreateDimReducObject) importFrom(Seurat,CreateSeuratObject) importFrom(Seurat,DefaultAssay) importFrom(Seurat,Embeddings) importFrom(Seurat,FetchData) importFrom(Seurat,FindAllMarkers) importFrom(Seurat,FindMarkers) importFrom(Seurat,GetAssayData) importFrom(Seurat,Idents) importFrom(Seurat,IsGlobal) importFrom(Seurat,Loadings) importFrom(Seurat,LogSeuratCommand) importFrom(Seurat,Project) importFrom(Seurat,Reductions) importFrom(Seurat,SelectIntegrationFeatures) importFrom(Seurat,SplitObject) importFrom(Seurat,Stdev) importFrom(Seurat,Tool) importFrom(Seurat,VariableFeatures) importFrom(Seurat,as.Graph) importFrom(Seurat,as.Seurat) importFrom(Seurat,as.SingleCellExperiment) importFrom(cowplot,theme_cowplot) importFrom(data.table,data.table) importFrom(data.table,fwrite) importFrom(data.table,setDTthreads) importFrom(ggplot2,aes_string) importFrom(ggplot2,geom_line) importFrom(ggplot2,geom_point) importFrom(ggplot2,geom_vline) importFrom(ggplot2,ggplot) importFrom(ggplot2,labs) importFrom(ggplot2,scale_x_continuous) importFrom(igraph,get.adjacency) importFrom(methods,"slot<-") importFrom(methods,as) importFrom(methods,new) importFrom(methods,slot) importFrom(remotes,install_github) importFrom(reticulate,import) importFrom(reticulate,py_module_available) importFrom(rlang,"%||%") importFrom(rlang,check_installed) importFrom(rlang,duplicate) importFrom(rsvd,rsvd) importFrom(stats,as.dist) importFrom(stats,pnorm) importFrom(stats,quantile) importFrom(stats,sd) importFrom(stats,setNames) importFrom(stats,wilcox.test) importFrom(tools,file_ext) importFrom(utils,assignInNamespace) importFrom(utils,browseURL) importFrom(utils,capture.output) importFrom(utils,install.packages) importFrom(utils,menu) importFrom(utils,packageVersion) importFrom(utils,setTxtProgressBar) importFrom(utils,txtProgressBar) importFrom(utils,write.table) 
================================================
FILE: R/alevin.R
================================================
#' @include internal.R
#'
NULL

#' Load alevin quantification data
#'
#' A wrapper around tximport to create a \code{SeuratObject}
#' from alevin quantification data.
#'
#' @param file path to \code{quants_mat.gz} file within
#' alevin directory
#' @param getMeta logical, option to use \code{tximeta} to
#' programmatically obtain gene range information, default
#' is FALSE. Ranges are stored in \code{chr}, \code{start},
#' and \code{end} in the \code{meta.features} slot.
#' @param meanAndVariance logical, should mean and variance
#' of counts be returned in \code{counts} and \code{data}
#' slots, respectively
#' @param ... extra arguments passed to \code{tximport},
#' for example,
#' \code{alevinArgs=list(filterBarcodes=TRUE)}.
#'
#' @return returns a Seurat object with alevin counts
#' @seealso \code{\link[alevin]{alevin}}
#' @author Avi Srivastava
#' @references Srivastava, Avi, et al. "Alevin efficiently
#' estimates accurate gene abundances from dscRNA-seq data."
#' Genome biology 20.1 (2019): 65.
#' @export
ReadAlevin <- function(file, getMeta=FALSE, meanAndVariance=FALSE, ...) {
  CheckPackage(package = 'tximport', repository = 'bioconductor')
  CheckPackage(package = 'fishpond', repository = 'bioconductor')
  # tximeta is only required for the getMeta path; probe for it once up front
  hasTximeta <- requireNamespace("tximeta", quietly=TRUE)
  metaSuccess <- FALSE
  if (getMeta) {
    if (!hasTximeta) {
      stop("tximeta is not installed, use BiocManager::install()")
    }
    se <- tximeta::tximeta(file, type="alevin", ...)
    # range annotation can be absent even when tximeta succeeds
    metaSuccess <- !is.null(SummarizedExperiment::rowRanges(se))
    # '&&' (not '&'): these are single logical conditions, not vectors
    if (meanAndVariance && all(c("mean","variance") %in% SummarizedExperiment::assayNames(se))) {
      txi <- list(mean=SummarizedExperiment::assays(se)[["mean"]],
                  variance=SummarizedExperiment::assays(se)[["variance"]])
    } else {
      txi <- list(counts=SummarizedExperiment::assays(se)[["counts"]])
    }
  } else {
    txi <- tximport::tximport(file, type="alevin", ...)
  }
  if (meanAndVariance) {
    # the tximport/tximeta call above must have produced both components
    if (!all(c("mean","variance") %in% names(txi))) {
      stop("mean and variance not present in alevin directory")
    }
    # mean is stored in 'counts', variance in 'data', as documented above
    obj <- CreateSeuratObject(counts=txi$mean)
    obj <- Seurat::SetAssayData(obj, "data", txi$variance)
  } else {
    obj <- CreateSeuratObject(counts=txi$counts)
  }
  if (metaSuccess) {
    # store per-gene range information as feature-level metadata on the RNA assay
    r <- SummarizedExperiment::rowRanges(se)
    obj[["RNA"]][["chr"]] <- as.character(GenomicRanges::seqnames(r))
    obj[["RNA"]][["start"]] <- GenomicRanges::start(r)
    obj[["RNA"]][["end"]] <- GenomicRanges::end(r)
  }
  return(obj)
}

================================================
FILE: R/alra.R
================================================
#' Run Adaptively-thresholded Low Rank Approximation (ALRA)
#'
#' Runs ALRA, a method for imputation of dropped out values in scRNA-seq data.
#' Computes the k-rank approximation to A_norm and adjusts it according to the
#' error distribution learned from the negative values. Described in
#' Linderman, G. C., Zhao, J., Kluger, Y. (2018). "Zero-preserving imputation
#' of scRNA-seq data using low rank approximation." (bioRxiv:138677)
#'
#' @param object An object
#' @param k The rank of the rank-k approximation. Set to NULL for automated choice of k.
#' @param q The number of additional power iterations in randomized SVD when
#' computing rank k approximation. By default, q=10.
#' @param quantile.prob The quantile probability to use when calculating threshold.
#' By default, quantile.prob = 0.001.
#' @param use.mkl Use the Intel MKL based implementation of SVD. Needs to be
#' installed from https://github.com/KlugerLab/rpca-mkl.
#' @param mkl.seed Only relevant if use.mkl=T. Set the seed for the random
#' generator for the Intel MKL implementation of SVD. Any number <0 will
#' use the current timestamp. If use.mkl=F, set the seed using
#' set.seed() function as usual.
#' @param assay Assay to use
#' @param slot slot to use
#' @param setDefaultAssay If TRUE, will set imputed results as default Assay
#' @param genes.use genes to impute
#' @param K Number of singular values to compute when choosing k. Must be less
#' than the smallest dimension of the matrix. Default 100 or smallest dimension.
#' @param thresh Number of standard deviations above the mean singular-value
#' spacing that a gap must exceed to be treated as signal when choosing k.
#' Default 6.
#' @param p.val.th The threshold for ''significance'' when choosing k. Default 1e-10.
#' @param noise.start Index for which all smaller singular values are considered noise.
#' Default K - 20.
#' @param q.k Number of additional power iterations when choosing k. Default 2.
#' @param k.only If TRUE, only computes optimal k WITHOUT performing ALRA
#'
#' @param ... Arguments passed to other methods
#'
#' @importFrom rsvd rsvd
#' @importFrom Matrix Matrix
#' @importFrom stats pnorm sd setNames quantile
#' @importFrom Seurat DefaultAssay Tool GetAssayData Tool<- CreateAssayObject
#' DefaultAssay<-
#'
#'
#' @rdname RunALRA
#' @export RunALRA
#'
#' @author Jun Zhao, George Linderman
#' @references Linderman, G. C., Zhao, J., Kluger, Y. (2018). "Zero-preserving imputation
#' of scRNA-seq data using low rank approximation." (bioRxiv:138677)
#' @seealso \code{\link{ALRAChooseKPlot}}
#'
#' @examples
#' \dontrun{
#' pbmc_small
#' # Example 1: Simple usage, with automatic choice of k.
#' pbmc_small_alra <- RunALRA(object = pbmc_small)
#' # Example 2: Visualize choice of k, then run ALRA
#' # First, choose K
#' pbmc_small_alra <- RunALRA(pbmc_small, k.only=TRUE)
#' # Plot the spectrum, spacings, and p-values which are used to choose k
#' ggouts <- ALRAChooseKPlot(pbmc_small_alra)
#' do.call(gridExtra::grid.arrange, c(ggouts, nrow=1))
#' # Run ALRA with the chosen k
#' pbmc_small_alra <- RunALRA(pbmc_small_alra)
#' }
#'
# S3 generic: dispatches to RunALRA.default (matrix-like input) or RunALRA.Seurat.
RunALRA <- function(object, ...) {
  UseMethod(generic = 'RunALRA', object = object)
}

#' @rdname RunALRA
#' @export
#'
# Core ALRA computation on a features-x-cells matrix.
# NOTE(review): despite the generic's documentation, this default method does
# not itself choose k; passing k = NULL fails at the 1:k subsetting below.
# Automated choice of k happens only in RunALRA.Seurat.
RunALRA.default <- function(
  object,
  k = NULL,
  q = 10,
  quantile.prob = 0.001,
  use.mkl = FALSE,
  mkl.seed = -1,
  ...
) {
  # Transpose so rows are cells and columns are genes for the SVD below
  A.norm <- t(x = as.matrix(x = object))
  message("Identifying non-zero values")
  # Remember which entries were observed, so they can be restored at the end
  originally.nonzero <- A.norm > 0
  message("Computing Randomized SVD")
  if (use.mkl) {
    CheckPackage(package = 'KlugerLab/rpca-mkl/fastRPCA', repository = 'github')
    # Assemble an rsvd-shaped list (u, d, v) from fastRPCA's output
    fastDecomp.noc <- setNames(
      object = vector(mode = "list", length = 3),
      nm = c("u", "d", "v")
    )
    fastPCAOut <- fastRPCA::fastPCA(
      inputMatrix = A.norm,
      k = k,
      its = q,
      l = (k + 10),
      seed = mkl.seed
    )
    fastDecomp.noc$u <- fastPCAOut$U
    fastDecomp.noc$v <- fastPCAOut$V
    fastDecomp.noc$d <- diag(x = fastPCAOut$S)
  } else {
    fastDecomp.noc <- rsvd(A = A.norm, k = k, q = q)
  }
  # Rank-k reconstruction of the normalized matrix
  A.norm.rank.k <- fastDecomp.noc$u[, 1:k] %*% diag(x = fastDecomp.noc$d[1:k]) %*% t(x = fastDecomp.noc$v[,1:k])
  message(sprintf("Find the %f quantile of each gene", quantile.prob))
  # A.norm.rank.k.mins <- abs(x = apply(X = A.norm.rank.k, MARGIN = 2, FUN = min))
  # Per-gene threshold: magnitude of the (quantile.prob) quantile of the
  # reconstructed values (an estimate of the most negative "error" value)
  A.norm.rank.k.mins <- abs(x = apply(
    X = A.norm.rank.k,
    MARGIN = 2,
    FUN = function(x) {
      return(quantile(x = x, probs = quantile.prob))
    }
  ))
  message("Thresholding by the most negative value of each gene")
  # Zero out entries below each gene's threshold (col() maps each entry to
  # its gene's threshold)
  A.norm.rank.k.cor <- replace(
    x = A.norm.rank.k,
    list = A.norm.rank.k <= A.norm.rank.k.mins[col(A.norm.rank.k)],
    values = 0
  )
  # Standard deviation of the non-zero entries of a column
  sd.nonzero <- function(x) {
    return(sd(x = x[!x == 0]))
  }
  # Per-gene sd/mean of non-zero entries in the thresholded reconstruction
  # (sigma.1/mu.1) and in the original matrix (sigma.2/mu.2);
  # !!x coerces to logical so colSums counts the non-zero entries
  sigma.1 <- apply(X = A.norm.rank.k.cor, MARGIN = 2, FUN = sd.nonzero)
  sigma.2 <- apply(X = A.norm, MARGIN = 2, FUN = sd.nonzero)
  mu.1 <- colSums(x = A.norm.rank.k.cor) / colSums(x = !!A.norm.rank.k.cor)
  mu.2 <- colSums(x = A.norm) / colSums(x = !!A.norm)
  # Only rescale genes where both sds are defined and sigma.1 is non-zero
  toscale <- !is.na(sigma.1) & !is.na(sigma.2) & !(sigma.1 == 0 & sigma.2 == 0) & !(sigma.1 == 0)
  message(sprintf(fmt = "Scaling all except for %d columns", sum(!toscale)))
  # Affine per-gene rescaling so the imputed distribution matches the
  # original non-zero mean/sd: x' = x * (sigma.2/sigma.1) + (mu.2 - mu.1*sigma.2/sigma.1)
  sigma.1.2 <- sigma.2 / sigma.1
  toadd <- -1 * mu.1 * sigma.2 / sigma.1 + mu.2
  A.norm.rank.k.temp <- A.norm.rank.k.cor[, toscale]
  A.norm.rank.k.temp <- sweep(
    x = A.norm.rank.k.temp,
    MARGIN = 2,
    STATS = sigma.1.2[toscale],
    FUN = "*"
  )
  A.norm.rank.k.temp <- sweep(
    x = A.norm.rank.k.temp,
    MARGIN = 2,
    STATS = toadd[toscale],
    FUN = "+"
  )
  A.norm.rank.k.cor.sc <- A.norm.rank.k.cor
  A.norm.rank.k.cor.sc[, toscale] <- A.norm.rank.k.temp
  # Entries zeroed by thresholding must stay zero after the affine shift
  A.norm.rank.k.cor.sc[A.norm.rank.k.cor == 0] <- 0
  # Rescaling can push values negative; clamp them to zero
  lt0 <- A.norm.rank.k.cor.sc < 0
  A.norm.rank.k.cor.sc[lt0] <- 0
  message(sprintf(
    fmt = "%.2f%% of the values became negative in the scaling process and were set to zero",
    100 * sum(lt0) / prod(dim(x = A.norm))
  ))
  # Zero-preserving: originally observed values are never imputed away
  A.norm.rank.k.cor.sc[originally.nonzero & A.norm.rank.k.cor.sc == 0] <-
    A.norm[originally.nonzero & A.norm.rank.k.cor.sc == 0]
  colnames(x = A.norm.rank.k) <- colnames(x = A.norm.rank.k.cor.sc) <-
    colnames(x = A.norm.rank.k.cor) <- colnames(x = A.norm)
  original.nz <- sum(A.norm > 0) / prod(dim(x = A.norm))
  completed.nz <- sum(A.norm.rank.k.cor.sc > 0) / prod(dim(x = A.norm))
  message(sprintf(
    fmt = "The matrix went from %.2f%% nonzero to %.2f%% nonzero",
    100 * original.nz,
    100 * completed.nz
  ))
  # Returned matrix is cells x genes (same orientation as A.norm)
  return(A.norm.rank.k.cor.sc)
}

#' @rdname RunALRA
#' @export
#' @method RunALRA Seurat
#'
# Seurat method: optionally chooses k from the singular-value spectrum,
# runs RunALRA.default, and stores the result in a new 'alra' assay.
RunALRA.Seurat <- function(
  object,
  k = NULL,
  q = 10,
  quantile.prob = 0.001,
  use.mkl = FALSE,
  mkl.seed=-1,
  assay = NULL,
  slot = "data",
  setDefaultAssay = TRUE,
  genes.use = NULL,
  K = NULL,
  thresh=6,
  noise.start = NULL,
  q.k = 2,
  k.only = FALSE,
  ...
) {
  # k.only with an explicit k is contradictory; warn (but continue)
  if (!is.null(x = k) && k.only) {
    warning("Stop: k is already given, set k.only = FALSE or k = NULL")
  }
  genes.use <- genes.use %||% rownames(x = object)
  assay <- assay %||% DefaultAssay(object = object)
  alra.previous <- Tool(object = object, slot = 'RunALRA')
  alra.info <- list()
  # Check if k is already stored
  if (is.null(x = k) & !is.null(alra.previous[["k"]])) {
    k <- alra.previous[["k"]]
    message("Using previously computed value of k")
  }
  data.used <- GetAssayData(object = object, assay = assay, layer = slot)[genes.use,]
  # Choose k with heuristics if k is not given
  if (is.null(x = k)) {
    # set K based on data dimension
    if (is.null(x = K)) {
      K <- 100
      if (K > min(dim(x = data.used))) {
        K <- min(dim(x = data.used))
        warning("For best performance, we recommend using ALRA on expression matrices larger than 100 by 100")
      }
    }
    if (K > min(dim(x = data.used))) {
      stop("For an m by n data, K must be smaller than the min(m,n)")
    }
    # set noise.start based on K
    if (is.null(x = noise.start)) {
      noise.start <- K - 20
      if (noise.start <= 0) {
        noise.start <- max(K - 5, 1)
      }
    }
    if (noise.start > K - 5) {
      stop("There need to be at least 5 singular values considered noise")
    }
    # Indices of singular values treated as the noise floor
    noise.svals <- noise.start:K
    if (use.mkl) {
      CheckPackage(package = 'KlugerLab/rpca-mkl/fastRPCA', repository = 'github')
      L <- min(K + 10, min(dim(x = data.used)))
      # Assemble an rsvd-shaped list (u, d, v) from fastRPCA's output
      rsvd.out <- setNames(
        object = vector(mode = "list", length = 3),
        nm = c("u", "d", "v")
      )
      fastPCAOut <- fastRPCA::fastPCA(
        inputMatrix = as.matrix(x = data.used),
        k = K,
        its = q.k,
        l = L,
        seed = mkl.seed
      )
      rsvd.out$u <- fastPCAOut$U
      rsvd.out$v <- fastPCAOut$V
      rsvd.out$d <- diag(x = fastPCAOut$S)
    } else {
      rsvd.out <- rsvd(A = t(x = as.matrix(x = data.used)), k = K, q = q.k)
    }
    # Spacings between consecutive singular values; k is the largest index
    # whose spacing is more than 'thresh' noise-sds above the noise mean
    diffs <- rsvd.out$d[1:(length(x = rsvd.out$d) - 1)] - rsvd.out$d[2:length(x = rsvd.out$d)]
    mu <- mean(x = diffs[noise.svals - 1])
    sigma <- sd(x = diffs[noise.svals - 1])
    num_of_sds <- (diffs - mu) / sigma
    k <- max(which(x = num_of_sds > thresh))
    # Persist diagnostics for ALRAChooseKPlot / later re-runs.
    # NOTE(review): only d, k and diffs are stored here; ALRAChooseKPlot also
    # reads a 'pvals' entry that is never populated — confirm upstream.
    alra.info[["d"]] <- rsvd.out$d
    alra.info[["k"]] <- k
    alra.info[["diffs"]] <- diffs
    Tool(object = object) <- alra.info
  }
  if (k.only) {
    message("Chose rank k = ", k, ", WITHOUT performing ALRA")
    return(object)
  }
  message("Rank k = ", k)
  # Perform ALRA on data.used
  output.alra <- RunALRA(
    object = data.used,
    k = k,
    q = q,
    quantile.prob = quantile.prob,
    use.mkl = use.mkl,
    mkl.seed = mkl.seed
  )
  # Save ALRA data in object@assay
  # (output.alra is cells x genes; transpose back to genes x cells)
  data.alra <- Matrix(data = t(x = output.alra), sparse = TRUE)
  rownames(x = data.alra) <- genes.use
  colnames(x = data.alra) <- colnames(x = object)
  assay.alra <- CreateAssayObject(data = data.alra)
  object[["alra"]] <- assay.alra
  if (setDefaultAssay) {
    message("Setting default assay as alra")
    DefaultAssay(object = object) <- "alra"
  }
  return(object)
}

#' ALRA Approximate Rank Selection Plot
#'
#' Plots the results of the approximate rank selection process for ALRA.
#'
#'
#' @param object Seurat object
#' @param start Index to start plotting singular value spacings from.
#' The transition from "signal" to "noise" in the plot is hard to see because the
#' first singular value spacings are so large. Nicer visualizations result from
#' skipping the first few. If set to 0 (default) starts from k/2.
#' @param combine Combine plots into a single gg object; note that if TRUE,
#' themeing will not work when plotting multiple features
#'
#' @return A list of 3 ggplot objects plotting the singular values, the
#' spacings of the singular values, and the p-values of the singular values.
#'
#' @author Jun Zhao, George Linderman
#' @seealso \code{\link{RunALRA}}
#'
#' @importFrom Seurat CombinePlots
#' @importFrom cowplot theme_cowplot
#' @importFrom ggplot2 ggplot aes_string geom_point geom_line
#' geom_vline scale_x_continuous labs
#' @export
#'
ALRAChooseKPlot <- function(object, start = 0, combine = TRUE) {
  # Diagnostics stored by RunALRA.Seurat's k-selection step
  alra.data <- Tool(object = object, slot = 'RunALRA')
  if (is.null(x = alra.data)) {
    stop('RunALRA should be run prior to using this function.')
  }
  d <- alra.data[["d"]]
  diffs <- alra.data[["diffs"]]
  # NOTE(review): RunALRA.Seurat never stores 'pvals', so this is NULL and the
  # third plot (gg3) will fail to build — confirm whether the p-value
  # computation was removed upstream.
  pvals <- alra.data[["pvals"]]
  k <- alra.data[["k"]]
  if (start == 0) {
    start <- floor(x = k / 2)
  }
  if (start > k) {
    stop("Plots should include k (i.e. starting.from should be less than k)")
  }
  breaks <- seq(from = 10, to = length(x = d), by = 10)
  # Plot 1: the singular-value spectrum with a vertical line at the chosen k
  ggdata <- data.frame(x = 1:length(x = d), y = d)
  gg1 <- ggplot(data = ggdata, mapping = aes_string(x = 'x', y = 'y')) +
    geom_point(size = 1) +
    geom_line(size = 0.5) +
    geom_vline(xintercept = k) +
    theme_cowplot() +
    scale_x_continuous(breaks = breaks) +
    labs(x = NULL, y = 's_i', title = 'Singular values')
  # Plot 2: spacings between consecutive singular values, skipping the first
  # (start - 1) spacings which dominate the scale
  ggdata <- data.frame(x = 2:length(x = d), y = diffs)[-(1:(start - 1)), ]
  gg2 <- ggplot(data = ggdata, mapping = aes_string(x = 'x', y = 'y')) +
    geom_point(size = 1) +
    geom_line(size = 0.5) +
    geom_vline(xintercept = k + 1) +
    theme_cowplot() +
    scale_x_continuous(breaks = breaks) +
    labs(x = NULL, y = 's_{i} - s_{i-1}', title = 'Singular value spacings')
  # Plot 3: p-values of the spacings (see NOTE above about 'pvals')
  ggdata <- data.frame(x = 2:length(x = d), y = pvals)
  gg3 <- ggplot(data = ggdata, mapping = aes_string(x = 'x', y = 'y')) +
    geom_point(size = 1) +
    geom_vline(xintercept = k + 1) +
    theme_cowplot() +
    scale_x_continuous(breaks = breaks) +
    labs(x = NULL, y = 'p.val', title = 'Singular value spacing p-values')
  plots <- list(spectrum = gg1, spacings = gg2, pvals = gg3)
  if (combine) {
    plots <- CombinePlots(plots = plots)
  }
  return(plots)
}

================================================
FILE: R/banksy.R
================================================
#' @include internal.R
#'
NULL

#' Run Banksy on a Seurat Object
#'
#' @param object A Seurat object
#' @param lambda (numeric) Spatial weight parameter
#' @param assay (character) Assay in Seurat object to use
#' @param slot (character) Slot in Seurat assay to use
#' @param use_agf (boolean) Whether to use the AGF
#' @param dimx (character) Column name of spatial x dimension (must be in metadata)
#' @param dimy (character) Column name of spatial y dimension (must be in metadata)
#' @param dimz (character) Column name of spatial z dimension (must be in metadata)
#' @param ndim (integer) Number of spatial dimensions to extract
#' @param features (character) Features to compute. Can be 'all', 'variable' or
#' a vector of feature names
#' @param group (character) Column name of a grouping variable (must be in metadata)
#' @param split.scale (boolean) Whether to separate scaling by group
#' @param k_geom (numeric) kNN parameter - number of neighbors to use
#' @param n (numeric) kNN_rn parameter - exponent of radius
#' @param sigma (numeric) rNN parameter - standard deviation of Gaussian kernel
#' @param alpha (numeric) rNN parameter - determines radius used
#' @param k_spatial (numeric) rNN parameter - number of neighbors to use
#' @param spatial_mode (character) Kernel for neighborhood computation
#' \itemize{
#'  \item{kNN_median: k-nearest neighbors with median-scaled Gaussian kernel}
#'  \item{kNN_r: k-nearest neighbors with $1/r$ kernel}
#'  \item{kNN_rn: k-nearest neighbors with $1/r^n$ kernel}
#'  \item{kNN_rank: k-nearest neighbors with rank Gaussian kernel}
#'  \item{kNN_unif: k-nearest neighbors with uniform kernel}
#'  \item{rNN_gauss: radial nearest neighbors with Gaussian kernel}
#' }
#' @param assay_name (character) Name for Banksy assay in Seurat object
#' @param M (numeric) Advanced usage.
#' Highest azimuthal harmonic
#' @param verbose (boolean) Print messages
#'
#' @return A Seurat object with new assay holding a Banksy matrix
#'
#' @seealso \code{\link[Banksy]{ComputeBanksy}}
#'
#' @author Joseph Lee, Vipul Singhal
#'
#' @references Vipul Singhal, Nigel Chou et. al. BANKSY: A Spatial Omics
#' Algorithm that Unifies Cell Type Clustering and Tissue Domain Segmentation
#'
#' @export
RunBanksy <- function(object, lambda, assay='RNA', slot='data', use_agf=FALSE,
                      dimx=NULL, dimy=NULL, dimz=NULL, ndim=2,
                      features='variable', group=NULL, split.scale=TRUE,
                      k_geom=15, n=2, sigma=1.5, alpha=0.05, k_spatial=10,
                      spatial_mode='kNN_median', assay_name='BANKSY',
                      M=NULL, verbose=TRUE) {
  # Check packages
  SeuratWrappers:::CheckPackage(package = 'data.table', repository = 'CRAN')
  SeuratWrappers:::CheckPackage(package = 'Matrix', repository = 'CRAN')
  SeuratWrappers:::CheckPackage(package = 'Banksy', repository = 'github')
  # Check lambda param
  if (lambda < 0 || lambda > 1) stop('Lambda must be between 0 and 1')
  # Get expression data (features x cells) for the requested features
  data_own <- get_data(object, assay, slot, features, verbose)
  # Get spatial locations, staggered by group if requested
  locs <- get_locs(object, dimx, dimy, dimz, ndim, data_own, group, verbose)
  if (!is.null(group)) {
    # Keep the staggered coordinates in metadata for later inspection
    object <- AddMetaData(
      object, metadata = locs,
      col.name = paste0('staggered_', colnames(locs)))
  }
  # Compute one neighbor matrix per k_geom value
  knn_list <- lapply(k_geom, function(kg) {
    Banksy:::computeNeighbors(locs,
      spatial_mode = spatial_mode, k_geom = kg, n = n,
      sigma=sigma, alpha=alpha, k_spatial=k_spatial,
      verbose=verbose)
  })
  # Harmonics to compute: 0..M (M resolved by Banksy from use_agf when NULL)
  M <- seq(0, max(Banksy:::getM(use_agf, M)))
  # Compute harmonics
  center <- rep(TRUE, length(M))
  # Only center higher harmonics
  center[1] <- FALSE
  har <- Map(function(knn_df, M, center) {
    x <- Banksy:::computeHarmonics(data_own, knn_df, M, center, verbose)
    rownames(x) <- paste0(rownames(x), '.m', M)
    x
  }, knn_list, M, center)
  # Scale by lambdas
  lambdas <- Banksy:::getLambdas(lambda, n_harmonics = length(har))
  # Merge with own expression
  if (verbose) message('Creating Banksy matrix')
  data_banksy <- c(list(data_own), har)
  if (verbose) message('Scaling BANKSY matrix. Do not call ScaleData on assay ', assay_name)
  data_scaled <- lapply(data_banksy, fast_scaler, object, group, split.scale, verbose)
  # Multiply each component (own expression + harmonics) by its lambda weight
  data_banksy <- Map(function(lam, mat) lam * mat, lambdas, data_banksy)
  data_scaled <- Map(function(lam, mat) lam * mat, lambdas, data_scaled)
  # Stack the weighted components into a single feature matrix
  data_banksy <- do.call(rbind, data_banksy)
  data_scaled <- do.call(rbind, data_scaled)
  # Create an assay object, matching the slot type of the input
  if (grepl(pattern = 'counts', x = slot)) {
    banksy_assay <- Seurat::CreateAssayObject(counts = data_banksy)
  } else {
    banksy_assay <- Seurat::CreateAssayObject(data = data_banksy)
  }
  # Add assay to Seurat object and set as default
  if (verbose) message('Setting default assay to ', assay_name)
  object[[assay_name]] <- banksy_assay
  DefaultAssay(object) <- assay_name
  object <- SetAssayData(object, layer = 'scale.data', new.data = data_scaled,
                         assay = assay_name)
  # Log commands
  object <- Seurat::LogSeuratCommand(object = object)
  return(object)
}

# Get own expression matrix (features x cells, dense) from a Seurat object.
get_data <- function(object, assay, slot, features, verbose) {
  # Fetch data from Seurat
  if (verbose) message('Fetching data from slot ', slot, ' from assay ', assay)
  data_own <- Seurat::GetAssayData(object = object, assay = assay, layer = slot)
  # Feature subset
  if (features[1] != 'all') {
    if (verbose) message('Subsetting by features')
    if (features[1] == 'variable') {
      feat <- Seurat::VariableFeatures(object)
      if (length(feat) == 0) {
        warning('No variable features found. Running Seurat::FindVariableFeatures')
        object <- Seurat::FindVariableFeatures(object)
        feat <- Seurat::VariableFeatures(object)
      }
    } else {
      # Keep the requested features that exist in the object.
      # (Fixed: the previous code indexed 'features' with positions from
      # 'rownames(object)', which returned NAs for any valid selection.)
      feat <- features[features %in% rownames(object)]
      if (length(feat) == 0) stop('None of the specified features found. Check if features in Seurat object')
    }
    data_own <- data_own[feat,,drop=FALSE]
  }
  data_own <- as.matrix(x = data_own)
  return(data_own)
}

# Get spatial locations from a Seurat object, either from two/three metadata
# columns (dimx/dimy/dimz) or via Seurat::GetTissueCoordinates.
get_locs <- function(object, dimx, dimy, dimz, ndim, data_own, group, verbose) {
  if (!is.null(dimx) && !is.null(dimy)) {
    # Extract locations from metadata
    locs <- data.frame(
      sdimx = unlist(object[[dimx]]),
      sdimy = unlist(object[[dimy]])
    )
    rownames(locs) <- colnames(object)
    # Add z-dim if present
    if (!is.null(dimz)) locs$sdimz = object[[dimz]]
    # Check locations: drop samples with no centroid
    obj_samples <- colnames(data_own)
    locs_samples <- rownames(locs)
    if (any(is.na(match(obj_samples, locs_samples)))) {
      na_id <- which(is.na(match(obj_samples, locs_samples)))
      warning('No centroids found for samples: ',
              paste(obj_samples[na_id], collapse = ', '), '. Dropping samples.')
      data_own <- data_own[, -na_id, drop = FALSE]
      # Refresh so the reordering below does not produce NA rows for the
      # samples just dropped.
      # NOTE(review): data_own is passed by value, so this drop does not
      # propagate to the caller's matrix — confirm intended behavior.
      obj_samples <- colnames(data_own)
    }
    # Align location rows with the (retained) expression columns
    locs <- locs[match(obj_samples, locs_samples),,drop=FALSE]
  } else {
    # Extract locations with Seurat accessor
    locs <- Seurat::GetTissueCoordinates(object)[,seq_len(ndim)]
  }
  dim_names <- paste0('sdim', c('x','y','z'))
  colnames(locs) <- dim_names[seq_len(ncol(locs))]
  if (!is.null(group)) {
    # Stagger locations along x by group so groups do not overlap spatially
    if (verbose) message('Staggering locations by ', group)
    locs[,1] = locs[,1] + abs(min(locs[,1]))
    max_x = max(locs[,1]) * 2
    n_groups = length(unique(unlist(object[[group]])))
    shift = seq(from = 0, length.out = n_groups, by = max_x)
    # NOTE(review): rep(shift, table(...)) assumes cells are ordered
    # contiguously by group — confirm for unordered metadata.
    locs[,1] = locs[,1] + rep(shift, table(object[[group]]))
  }
  return(locs)
}

# Row-scale a matrix, optionally scaling each group of cells separately.
fast_scaler = function(data, object, group, split.scale, verbose) {
  # Split scaling by group
  if (!is.null(group) && split.scale) {
    groups = unlist(object[[group]])
    ugroups = unique(groups)
    for (curr_group in ugroups) {
      if (verbose) message('Scaling group: ', curr_group)
      curr_group_id <- which(curr_group == groups)
      data[, curr_group_id] <- Seurat:::FastRowScale(
        data[, curr_group_id])
    }
  } else {
    # FastRowScale is internal to Seurat (not exported), so ':::' is required
    # here just as in the branch above; '::' would fail at run time.
    data <- Seurat:::FastRowScale(data)
  }
  data
}

================================================
FILE:
R/cellbrowser.R ================================================
# Build a UCSC cell browser website from a \code{Seurat} object
#
NULL
#require(reticulate)
#require(Matrix)
#require(R.utils)
#' Used by \code{ExportToCellbrowser}:
#' Write a big sparse matrix to a .tsv.gz file by writing chunks, concating them with the Unix cat command,
#' then gziping the result. This does not work on Windows, we'd have to use the copy /b command there.
#'
#' @param inMat input matrix
#' @param outFname output file name, has to end with .gz
#' @param sliceSize=1000, size of each chunk in number of lines
#'
#' @return Invisibly returns \code{NULL}
#'
#' @importFrom data.table setDTthreads data.table fwrite
#'
#' @examples
#' \dontrun{
#' writeSparseTsvChunks( pbmc_small@data, "exprMatrix.tsv.gz")
#' }
#'
writeSparseTsvChunks = function (inMat, outFname, sliceSize=1000) {
  fnames = c()
  setDTthreads(threads = 8) # otherwise this would use dozens of CPUs on a fat server
  mat = inMat
  geneCount = nrow(mat)
  message("Writing expression matrix to ", outFname)
  startIdx = 1
  # BUGFIX: loop condition was `startIdx < geneCount`, which silently dropped
  # the final row whenever nrow(mat) %% sliceSize == 1 (and wrote nothing at
  # all for a 1-row matrix). `<=` ensures every row is written exactly once.
  while (startIdx <= geneCount) {
    endIdx <- min(startIdx + sliceSize - 1, geneCount)
    matSlice <- mat[startIdx:endIdx, ]
    denseSlice <- as.matrix(x = matSlice)
    dt <- data.table(denseSlice)
    dt <- cbind(gene = rownames(x = matSlice), dt)
    writeHeader <- startIdx == 1  # only the first chunk carries the header row
    # NOTE(review): chunk files use fixed names in the working directory;
    # concurrent exports in the same cwd would collide — consider tempfile()
    sliceFname <- paste0("temp", startIdx, ".txt")
    fwrite(dt, sep = "\t", file = sliceFname, quote = FALSE, col.names = writeHeader)
    fnames <- append(x = fnames, values = sliceFname)
    startIdx <- startIdx + sliceSize
  }
  message("Concatenating chunks")
  system(command = paste(
    "cat", paste(fnames, collapse = " "), "| gzip >", outFname,
    sep = " "
  ))
  unlink(x = fnames)
  return(invisible(x = NULL))
}

#' used by ExportToCellbrowser:
#' Return a matrix object from a Seurat object or show an error message
#'
#' @param object Seurat object
#' @param matrix.slot the name of the slot
#'
findMatrix = function(object, matrix.slot) {
  if (matrix.slot == "counts") {
    counts <- GetAssayData(object = object, slot = "counts")
  } else if (matrix.slot == "scale.data") {
    counts <- GetAssayData(object = object, slot = "scale.data")
  } else if (matrix.slot == "data") {
    counts <- GetAssayData(object = object)
  } else {
    stop("matrix.slot can only be one of: counts, scale.data, data")
  }
  # Return explicitly: the original relied on the invisible value of the last
  # assignment, which works but is fragile and hides the function's result.
  return(counts)
}

#' Export \code{Seurat} objects for UCSC cell browser and stop open cell browser
#' instances from R
#'
#' @param object Seurat object
#' @param dir path to directory where to save exported files. These are:
#' exprMatrix.tsv, tsne.coords.tsv, meta.tsv, markers.tsv and a default
#' cellbrowser.conf
#' @param dataset.name name of the dataset. Defaults to Seurat project name
#' @param reductions vector of reduction names to export, defaults to all reductions.
#' @param markers.file path to file with marker genes. By defaults, marker
#' are searched in the object itself as misc$markers. If none are supplied in
#' object or via this argument, they are recalculated with \code{FindAllMarkers}
#' @param markers.n if no markers were supplied, FindAllMarkers is run.
#' This parameter indicates how many markers to calculate, default is 100
#' @param matrix.slot matrix to use, default is 'counts'
#' @param use.mtx export the matrix in .mtx.gz format. Default is False,
#' unless the matrix is bigger than R's maximum matrix size.
#' @param cluster.field name of the metadata field containing cell cluster
#' @param cb.dir path to directory where to create UCSC cellbrowser static
#' website content root, e.g. an index.html, .json files, etc. These files
#' can be copied to any webserver. If this is specified, the cellbrowser
#' package has to be accessible from R via reticulate.
#' @param meta.fields vector of meta fields to export, default is all.
#' @param meta.fields.names vector meta field names to show in UI. Must have
#' same length as meta.fields. Default is meta.fields.
#' @param skip.markers whether to skip exporting markers
#' @param skip.expr.matrix whether to skip exporting expression matrix
#' @param skip.metadata whether to skip exporting metadata
#' @param skip.reductions whether to skip exporting reductions
#' @param port on which port to run UCSC cellbrowser webserver after export
#' @param ... specifies the metadata fields to export. To supply a field and its
#' human readable name, pass name as \code{field="name"} parameter.
#'
#' @return This function exports Seurat object as a set of tsv files
#' to \code{dir} directory, copying the \code{markers.file} if it is
#' passed. It also creates the default \code{cellbrowser.conf} in the
#' directory. This directory could be read by \code{cbBuild} to
#' create a static website viewer for the dataset. If \code{cb.dir}
#' parameter is passed, the function runs \code{cbBuild} (if it is
#' installed) to create this static website in \code{cb.dir} directory.
#' If \code{port} parameter is passed, it also runs the webserver for
#' that directory and opens a browser.
#'
#' @author Maximilian Haeussler, Nikolay Markov
#'
#' @importFrom tools file_ext
#' @importFrom utils browseURL packageVersion write.table
#' @importFrom R.utils gzip
#' @importFrom reticulate py_module_available import
#' @importFrom Seurat Project Idents GetAssayData Embeddings FetchData
#' @importFrom Matrix writeMM
#'
#' @export
#'
#' @name CellBrowser
#' @rdname CellBrowser
#'
#' @importFrom methods slot
#' @importFrom utils packageVersion
#' @importFrom reticulate py_module_available import
#'
#' @examples
#' \dontrun{
#' ExportToCellbrowser(pbmc_small, dataset.name = "PBMC", dir = "out")
#' }
#'
ExportToCellbrowser <- function(
  object,
  dir,
  dataset.name = Project(object = object),
  reductions = NULL,
  markers.file = NULL,
  cluster.field = NULL,
  cb.dir = NULL,
  port = NULL,
  use.mtx = FALSE,
  meta.fields = NULL,
  meta.fields.names = NULL,
  matrix.slot = "counts",
  markers.n = 100,
  skip.markers = FALSE,
  skip.expr.matrix = FALSE,
  skip.metadata = FALSE,
  skip.reductions = FALSE
) {
  if (!requireNamespace("Seurat", quietly = TRUE)) {
    stop("This script requires that Seurat (V2 or V3) is installed")
  }
  message("Seurat Version installed: ", packageVersion("Seurat"))
  message("Object was created with Seurat version ", object@version)
  objMaj = package_version(object@version)$major
  pkgMaj = package_version(packageVersion("Seurat"))$major
  if (objMaj != 2 && objMaj != 3) {
    stop("can only process Seurat2 or Seurat3 objects, object was made with Seurat ", object@version)
  }
  if (objMaj != pkgMaj) {
    stop("The installed major version of Seurat is different from Seurat input object. You have to down- or upgrade your installed Seurat version. See the Seurat documentation.")
  }
  reducNames = reductions
  # compatibility layer for Seurat 2 vs 3
  # see https://satijalab.org/seurat/essential_commands.html
  if (inherits(x = object, what = 'seurat')) {
    # Seurat v2 objects are called "seurat" (Paul Hoffman)
    # -> Seurat 2 data access
    idents <- object@ident # Idents() in Seurat3
    meta <- object@meta.data
    cellOrder <- object@cell.names
    if (matrix.slot == "counts") {
      counts <- object@raw.data
    } else if (matrix.slot == "scale.data") {
      counts <- object@scale.data
    } else if (matrix.slot == "data") {
      counts <- object@data
    } else {
      # BUGFIX: the original called error(), which does not exist in base R
      # and would itself crash with "could not find function"; use stop().
      stop("matrix.slot can only be one of: counts, scale.data, data")
    }
    genes <- rownames(x = object@data)
    dr <- object@dr
  } else {
    # Seurat 3 functions
    idents <- Idents(object = object)
    meta <- object[[]]
    cellOrder <- colnames(x = object)
    counts <- findMatrix(object = object, matrix.slot = matrix.slot)
    if (dim(x = counts)[1] == 0) {
      message(paste0("The Seurat data slot '", matrix.slot, "' contains no data. Trying default assay."))
      defAssay <- DefaultAssay(object)
      assay <- GetAssay(object, defAssay)
      message(paste0("Default assay is ", defAssay))
      counts <- findMatrix(assay, matrix.slot)
      genes <- rownames(counts)
      if (dim(x = counts)[1] == 0) {
        stop(
          "Could not find an expression matrix",
          "Please select the correct slot where the matrix is stored, possible ",
          "values are 'counts', 'scale.data' or 'data'. To select a slot, ",
          "use the option 'matrix.slot' from R or the cbImportSeurat option -s from the command line."
        )
      }
    } else {
      genes <- rownames(x = object)
    }
    dr <- object@reductions
  }
  if (is.null(x = cluster.field)) {
    cluster.field = "Cluster"
  }
  if (is.null(x = meta.fields)) {
    meta.fields <- colnames(x = meta)
    if (length(x = levels(x = idents)) > 1) {
      meta.fields <- c(meta.fields, ".ident")
    }
  }
  if (!is.null(x = port) && is.null(x = cb.dir)) {
    stop("cb.dir parameter is needed when port is set")
  }
  if (!dir.exists(paths = dir)) {
    dir.create(path = dir)
  }
  if (!dir.exists(paths = dir)) {
    stop("Output directory ", dir, " cannot be created or is a file")
  }
  if (dataset.name == "SeuratProject") {
    warning("Using default project name means that you may overwrite project with the same name in the cellbrowser html output folder")
  }
  enum.fields <- c()
  # Export expression matrix
  if (!skip.expr.matrix) {
    too.big = ((((ncol(counts) / 1000) * (nrow(counts) / 1000)) > 2000) && is(counts, 'sparseMatrix'))
    if (use.mtx || (too.big && (.Platform$OS.type == "windows"))) {
      # we have to write the matrix to an mtx file
      matrixPath <- file.path(dir, "matrix.mtx")
      genesPath <- file.path(dir, "features.tsv")
      barcodesPath <- file.path(dir, "barcodes.tsv")
      message("Writing expression matrix to ", matrixPath)
      writeMM(counts, matrixPath)
      # easier to load if the genes file has at least two columns. Even though seurat objects
      # don't have yet explicit geneIds/geneSyms data, we just duplicate whatever the matrix has now
      write.table(as.data.frame(cbind(rownames(counts), rownames(counts))), file = genesPath, sep = "\t", row.names = F, col.names = F, quote = F)
      write(colnames(counts), file = barcodesPath)
      message("Gzipping expression matrix")
      gzip(matrixPath)
      gzip(genesPath)
      gzip(barcodesPath)
    } else {
      # we can write the matrix as a tsv file
      gzPath <- file.path(dir, "exprMatrix.tsv.gz")
      if (too.big) {
        writeSparseTsvChunks(counts, gzPath)
      } else {
        mat = as.matrix(counts)
        df <- as.data.frame(mat, check.names = FALSE)
        df <- data.frame(gene = genes, df, check.names = FALSE)
        z <- gzfile(gzPath, "w")
        message("Writing expression matrix to ", gzPath)
        write.table(x = df, sep = "\t", file = z, quote = FALSE, row.names = FALSE)
        close(con = z)
      }
    }
  }
  # Export cell embeddings/reductions.
  # BUGFIX: the skip.reductions parameter was documented and accepted but never
  # honored; it now skips this section (coords=[] is then written to the conf).
  foundEmbedNames = c()
  if (!skip.reductions) {
    if (is.null(reducNames)) {
      reducNames = names(dr)
      message("Using all embeddings contained in the Seurat object: ", reducNames)
    }
    for (embedding in reducNames) {
      emb <- dr[[embedding]]
      if (is.null(x = emb)) {
        message("Embedding ", embedding, " does not exist in Seurat object. Skipping. ")
        next
      }
      df <- emb@cell.embeddings
      if (ncol(x = df) > 2) {
        warning('Embedding ', embedding, ' has more than 2 coordinates, taking only the first 2')
        df <- df[, 1:2]
      }
      colnames(x = df) <- c("x", "y")
      df <- data.frame(cellId = rownames(x = df), df, check.names = FALSE)
      fname <- file.path(
        dir,
        sprintf("%s.coords.tsv", embedding)
      )
      message("Writing embeddings to ", fname)
      write.table(df[cellOrder, ], sep = "\t", file = fname, quote = FALSE, row.names = FALSE)
      foundEmbedNames = append(foundEmbedNames, embedding)
    }
  }
  # by default, the embeddings are sorted in the object by order of creation (pca, tsne, umap).
  # But that is usually the opposite of what users want, they want the last embedding to appear first
  # in the UI, so reverse the order here
  foundEmbedNames = sort(foundEmbedNames, decreasing = T)
  embeddings.conf <- c()
  for (embedName in foundEmbedNames) {
    conf <- sprintf(
      '{"file": "%s.coords.tsv", "shortLabel": "Seurat %1$s"}',
      embedName
    )
    embeddings.conf <- c(embeddings.conf, conf)
  }
  # Export metadata.
  # BUGFIX: the skip.metadata parameter was documented and accepted but never
  # honored; it now skips writing meta.tsv (enumFields is then empty).
  if (!skip.metadata) {
    df <- data.frame(row.names = cellOrder, check.names = FALSE)
    for (field in meta.fields) {
      if (field == ".ident") {
        df$Cluster <- idents
        enum.fields <- c(enum.fields, "Cluster")
      } else {
        name <- meta.fields.names[[field]]
        if (is.null(name)) {
          name <- field
        }
        df[[name]] <- meta[[field]]
        if (!is.numeric(df[[name]])) {
          enum.fields <- c(enum.fields, name)
        }
      }
    }
    df <- data.frame(Cell = rownames(df), df, check.names = FALSE)
    fname <- file.path(dir, "meta.tsv")
    message("Writing meta data to ", fname)
    write.table(as.matrix(df[cellOrder, ]), sep = "\t", file = fname, quote = FALSE, row.names = FALSE)
  }
  # Export markers
  markers.string <- ''
  if (is.null(markers.file)) {
    ext <- "tsv"
  } else {
    ext <- tools::file_ext(markers.file)
  }
  file <- paste0("markers.", ext)
  fname <- file.path(dir, file)
  if (!is.null(markers.file) && !skip.markers) {
    message("Copying ", markers.file, " to ", fname)
    file.copy(markers.file, fname)
  }
  if (is.null(markers.file) && skip.markers) {
    file <- NULL
  }
  if (is.null(markers.file) && !skip.markers) {
    if (length(levels(idents)) > 1) {
      # For each cluster: order by significance then effect size and keep the
      # top markers.n gene row-names, padding with NA so ave() keeps lengths.
      markers.helper <- function(x) {
        partition <- markers[x, ]
        ord <- order(partition$p_val_adj < 0.05, -partition$avg_logFC)
        res <- x[ord]
        naCount <- max(0, length(x) - markers.n)
        res <- c(res[1:markers.n], rep(NA, naCount))
        return(res)
      }
      if (.hasSlot(object, "misc") && !is.null(x = object@misc["markers"][[1]])) {
        message("Found precomputed markers in obj@misc['markers']")
        markers <- object@misc["markers"]$markers
      } else {
        message("Running FindAllMarkers(), using wilcox test, min logfc diff 0.25")
        markers <- FindAllMarkers(
          object,
          do.print = TRUE,
          print.bar = TRUE,
          test.use = "wilcox",
          logfc.threshold = 0.25
        )
      }
      message("Writing top ", markers.n, ", cluster markers to ", fname)
      markers.order <- ave(x = rownames(x = markers), markers$cluster, FUN = markers.helper)
      top.markers <- markers[markers.order[!is.na(x = markers.order)], ]
      write.table(x = top.markers, file = fname, quote = FALSE, sep = "\t", col.names = NA)
    } else {
      message("No clusters found in Seurat object and no external marker file provided, so no marker genes can be computed")
      file <- NULL
    }
  }
  if (!is.null(file)) {
    markers.string <- sprintf(
      'markers = [{"file": "%s", "shortLabel": "Seurat Cluster Markers"}]',
      file
    )
  }
  matrixOutPath <- "exprMatrix.tsv.gz"
  if (use.mtx) {
    matrixOutPath <- "matrix.mtx.gz"
  }
  config <- '
# This is a bare-bones cellbrowser config file auto-generated from R.
# Look at https://github.com/maximilianh/cellBrowser/blob/master/src/cbPyLib/cellbrowser/sampleConfig/cellbrowser.conf
# for a full file that shows all possible options
name="%s"
shortLabel="%1$s"
exprMatrix="%s"
#tags = ["10x", "smartseq2"]
meta="meta.tsv"
# possible values: "gencode-human", "gencode-mouse", "symbol" or "auto"
geneIdType="auto"
# file with gene,description (one per line) with highlighted genes, called "Dataset Genes" in the user interface
# quickGenesFile="quickGenes.csv"
clusterField="%s"
labelField="%s"
enumFields=%s
%s
coords=%s'
  enum.string <- paste0(
    "[",
    paste(paste0('"', enum.fields, '"'), collapse = ", "),
    "]"
  )
  coords.string <- paste0(
    "[",
    paste(embeddings.conf, collapse = ",\n"),
    "]"
  )
  config <- sprintf(
    config,
    dataset.name,
    matrixOutPath,
    cluster.field,
    cluster.field,
    enum.string,
    markers.string,
    coords.string
  )
  confPath = file.path(dir, "cellbrowser.conf")
  message("Writing cellbrowser config to ", confPath)
  cat(config, file = confPath)
  message("Prepared cellbrowser directory ", dir)
  if (!is.null(x = cb.dir)) {
    if (!py_module_available(module = "cellbrowser")) {
      stop(
        "The Python package `cellbrowser` is required to prepare and run ",
        "Cellbrowser. Please install it ",
        "on the Unix command line with `sudo pip install cellbrowser` (if root) ",
        "or `pip install cellbrowser --user` (as a non-root user). ",
        "To adapt the Python that is used, you can either set the env. variable RETICULATE_PYTHON ",
        "or do `require(reticulate) and use one of these functions: use_python(), use_virtualenv(), use_condaenv(). ",
        "See https://rstudio.github.io/reticulate/articles/versions.html; ",
        "at the moment, R's reticulate is using this Python: ",
        import(module = 'sys')$executable,
        ". "
      )
    }
    if (!is.null(x = port)) {
      port <- as.integer(x = port)
    }
    message("Converting cellbrowser directory to html/json files")
    cb <- import(module = "cellbrowser")
    cb$cellbrowser$build(dir, cb.dir)
    message("HTML files are ready in ", cb.dir)
    if (!is.null(port)) {
      message("Starting http server")
      cb$cellbrowser$stop()
      cb$cellbrowser$serve(cb.dir, port)
      Sys.sleep(time = 0.4)
      browseURL(url = paste0("http://localhost:", port))
    }
  }
}

#' Stop Cellbrowser web server
#'
#' @export
#'
#' @importFrom reticulate py_module_available
#' @importFrom reticulate import
#'
#' @examples
#' \dontrun{
#' StopCellbrowser()
#' }
#'
StopCellbrowser <- function() {
  if (py_module_available("cellbrowser")) {
    cb <- import("cellbrowser")
    cb$cellbrowser$stop()
  } else {
    stop("The `cellbrowser` package is not available in the Python used by R's reticulate")
  }
}
================================================ FILE: R/cogaps.R ================================================
#' @include internal.R
#'
NULL
#' Run CoGAPs on a Seurat object
#'
#' @param object Seurat object
#' @param assay Assay to pull data from
#' @param slot Slot to pull data from.
#' @param params \code{\link[CoGAPS]{CogapsParams}} object for specifying parameter settings
#' @param temp.file Name of temporary data matrix file to create if running in a distributed mode.
#' Setting to TRUE will generate the file name using \code{tempfile}.
#' @param reduction.name Name of the CoGAPS reduction returned
#' @param reduction.key Key for the CoGAPS reduction returned
#'
#' @return Returns a Seurat object with the CoGAPS results stored as a \code{\link{DimReduc}} object
#' @seealso \code{\link[CoGAPS]{CoGAPS}}
#' @references E.J. Fertig, J. Ding, A.V. Favorov, G. Parmigiani, and M.F. Ochs (2010) CoGAPS: an
#' integrated R/C++ package to identify overlapping patterns of activation of biological processes
#' from expression data. Bioinformatics 26:2792-2793.
#'
#' @importFrom Matrix writeMM
#'
#' @export
RunCoGAPS <- function(
  object,
  assay = NULL,
  slot = "counts",
  params = NULL,
  temp.file = NULL,
  reduction.name = "CoGAPS",
  reduction.key = "CoGAPS_",
  ...
) {
  # Plain internal call for consistency with the other wrappers
  # (was SeuratWrappers:::CheckPackage, which is redundant inside the package)
  CheckPackage(package = 'CoGAPS', repository = 'bioconductor')
  assay <- assay %||% DefaultAssay(object = object)
  dat <- GetAssayData(object = object, assay = assay, slot = slot)
  # CoGAPS expects log-space data
  dat <- log2(x = dat + 1)
  geneNames <- rownames(x = dat)
  sampleNames <- colnames(x = dat)
  if (!is.null(temp.file)) {
    # Distributed mode: hand CoGAPS a MatrixMarket file path instead of a matrix
    if (isTRUE(x = temp.file)) {
      temp.file <- paste0(tempfile(), ".mtx")
    } else if (file.exists(temp.file)) {
      stop("temp.file already exists and would be overwritten. Please either remove or specify a new name.")
    }
    dat <- as(object = dat, Class = "dgCMatrix")
    Matrix::writeMM(obj = dat, file = temp.file)
    dat <- temp.file
  } else {
    dat <- as.matrix(x = dat)
  }
  if (!is.null(x = params)) {
    CoGAPS_results <- CoGAPS::CoGAPS(
      data = dat,
      params = params,
      geneNames = geneNames,
      sampleNames = sampleNames,
      ...
    )
  } else {
    CoGAPS_results <- CoGAPS::CoGAPS(
      data = dat,
      geneNames = geneNames,
      sampleNames = sampleNames,
      ...
    )
  }
  # BUGFIX: reduction.name and reduction.key were documented parameters but the
  # original hard-coded "CoGAPS"/"CoGAPS_" here, silently ignoring them.
  # Defaults are unchanged, so existing callers are unaffected.
  object[[reduction.name]] <- CreateDimReducObject(
    embeddings = slot(object = CoGAPS_results, name = "sampleFactors"),
    loadings = slot(object = CoGAPS_results, name = "featureLoadings"),
    key = reduction.key,
    assay = assay
  )
  return(object)
}
================================================ FILE: R/conos.R ================================================
#' Extra conversions to Seurat objects
#'
#' @inheritParams Seurat::as.Seurat
#'
#' @rdname as.Seurat.extras
#' @name as.Seurat
#'
#' @seealso \code{\link[Seurat]{as.Seurat}}
#'
#' @aliases as.Seurat
#'
NULL
#' @param method Name of matching method graph was built using
#' @param reduction Name of graph embedding, if calculated
#' @param idents Name of clustering method to set as identity class
#'
#' @details
#' The \code{Conos} method for \code{\link[Seurat]{as.Seurat}} only works if all
#' samples are \code{Seurat} objects. The object is initially constructed by merging
#' all samples together using \code{\link[Seurat]{merge}}, any sample-level dimensional
#' reductions and graphs will be lost during the merge. Extra information is added
#' to the resulting Seurat object as follows:
#' \itemize{
#' \item Pairwise alignments will be stored in miscellaneous data, as will any
#' other miscellaneous information
#' \item If a graph is present in the \code{graph} field, it will be stored as
#' a \code{Graph} object, reordered to match cell order in the new \code{Seurat}
#' object. It will be named "\code{DefaultAssay(SeuratObject)}_\code{method}"
#' \item If an embedding is present in the \code{embedding} field as a
#' \code{\link{matrix}}, it will be stored as a \code{DimReduc} object with the
#' name \code{reduction} and a key value of "\code{toupper(reduction)}_"
#' \item If the length of the \code{clusters} field is greater than zero,
#' clustering information (\code{groups} field) will be added to object metadata.
#' Extra information (\code{result} field) will be added to miscellaneous data
#' with the name "conos.\code{clustering}.result"
#' \item If present, the first clustering entry in the \code{clusters} field
#' will be set as object identity classes
#' }
#'
#' @importFrom igraph get.adjacency
#' @importFrom utils txtProgressBar setTxtProgressBar
#' @importFrom Seurat as.Seurat Misc<- DefaultAssay as.Graph
#' CreateDimReducObject Idents<-
#'
#' @rdname as.Seurat.extras
#' @export
#' @method as.Seurat Conos
#'
as.Seurat.Conos <- function(
  x,
  method = 'mnn',
  reduction = 'largeVis',
  idents = names(x = x$clusters)[1],
  verbose = TRUE,
  ...
) {
  if (!all(sapply(X = x$samples, FUN = inherits, what = 'Seurat'))) {
    stop(
      "Converting a Conos object to a Seurat object requires that all samples are Seurat v3 objects",
      call. = FALSE
    )
  }
  if (verbose) {
    message("Merging ", length(x = x$samples), " samples")
  }
  object <- merge(x = x$samples[[1]], x$samples[2:length(x = x$samples)])
  # Add pairs
  if (length(x = x$pairs) > 0) {
    if (verbose) {
      message("Adding pairwise alignments to 'conos.pairs' in miscellaneous data")
    }
    Misc(object = object, slot = 'conos.pairs') <- x$pairs
  }
  # Add graph, reordered to the merged object's cell order
  if (!is.null(x = x$graph)) {
    graph <- paste(DefaultAssay(object = object), method, sep = '_')
    message("Adding graph as '", graph, "'")
    object[[graph]] <- as.Graph(x = get.adjacency(graph = x$graph)[colnames(x = object), colnames(x = object)])
  }
  # Add graph embedding
  if (is.matrix(x = x$embedding)) {
    if (verbose) {
      message("Adding graph embedding as ", reduction)
    }
    object[[reduction]] <- suppressWarnings(expr = CreateDimReducObject(
      embeddings = x$embedding,
      assay = DefaultAssay(object = object),
      key = paste0(toupper(x = reduction), '_')
    ))
  }
  # Add clustering information
  if (length(x = x$clusters) > 0) {
    if (verbose) {
      message("Adding clustering information")
      pb <- txtProgressBar(min = 0, max = length(x = x$clusters), style = 3, file = stderr())
    }
    for (clustering in names(x = x$clusters)) {
      object[[clustering]] <- x$clusters[[clustering]]$groups
      clustering.misc <- paste('conos', clustering, 'result', sep = '.')
      Misc(object = object, slot = clustering.misc) <- x$clusters[[clustering]]$result
      if (clustering == idents) {
        Idents(object = object) <- clustering
      }
      if (verbose) {
        setTxtProgressBar(pb = pb, value = 1 + pb$getVal())
      }
    }
    # BUGFIX: close the progress bar inside this branch. The original closed
    # `pb` unconditionally after it, which errored with "object 'pb' not found"
    # when verbose = TRUE but the Conos object had no clusterings.
    if (verbose) {
      close(con = pb)
    }
  }
  # Add miscellaneous information
  if (length(x = x$misc) > 0) {
    if (verbose) {
      message("Adding extra information to 'conos.misc' in miscellaneous data")
    }
    Misc(object = object, slot = 'conos.misc') <- x$misc
  }
  return(object)
}
================================================ FILE: R/fast_mnn.R ================================================
#' @include internal.R
#'
NULL
#' Run fastMNN
#'
#' @param object.list A list of Seurat objects
#' @param assay Assay to use, defaults to the default assay of the first object
#' @param features Either a list of features to use when calculating batch
#' correction, or a number (2000 by default) of variable features to select.
#' @param reduction.name Name to store resulting DimReduc object as
#' @param reduction.key Key for resulting DimReduc
#' @param reconstructed.assay Name for the assay containing the low-rank
#' reconstruction of the expression matrix.
#' @param verbose Print messages from \code{\link[Seurat]{SelectIntegrationFeatures}}
#' @param ... Extra parameters passed to \code{\link[batchelor]{fastMNN}}
#'
#' @return A Seurat object merged from the objects in \code{object.list} and a
#' new DimReduc of name \code{reduction.name} (key set to \code{reduction.key})
#' with corrected embeddings matrix as well as the rotation matrix used for the
#' PCA stored in the feature loadings slot.
#' Also returns an expression matrix
#' reconstructed from the low-rank approximation in the
#' \code{reconstructed.assay} assay; all other metadata info
#' \code{\link[batchelor]{fastMNN}} is stored in the \code{tool} slot,
#' accessible with \code{\link[Seurat]{Tool}}
#'
#' @importFrom Seurat DefaultAssay DefaultAssay<- SelectIntegrationFeatures VariableFeatures VariableFeatures<-
#' as.SingleCellExperiment CreateDimReducObject Tool<- LogSeuratCommand
#'
#' @export
#'
#' @seealso \code{\link[batchelor]{fastMNN}} \code{\link[Seurat]{Tool}}
#'
RunFastMNN <- function(
  object.list,
  assay = NULL,
  features = 2000,
  reduction.name = "mnn",
  reduction.key = "mnn_",
  reconstructed.assay = "mnn.reconstructed",
  verbose = TRUE,
  ...
) {
  # fastMNN lives in the Bioconductor package batchelor; fail early if missing
  CheckPackage(package = "batchelor", repository = "bioconductor")
  if (!all(sapply(X = object.list, FUN = inherits, what = "Seurat"))) {
    stop("'object.list' must be a list of Seurat objects", call. = FALSE)
  }
  if (length(x = object.list) < 2) {
    stop("'object.list' must contain multiple Seurat objects for integration", call. = FALSE)
  }
  # All objects are switched to the same assay before feature selection
  assay <- assay %||% DefaultAssay(object = object.list[[1]])
  for (i in 1:length(x = object.list)) {
    DefaultAssay(object = object.list[[i]]) <- assay
  }
  # Numeric `features` means "select this many integration features";
  # otherwise it is used directly as the feature vector
  if (is.numeric(x = features)) {
    if (verbose) {
      message(paste("Computing", features, "integration features"))
    }
    features <- SelectIntegrationFeatures(
      object.list = object.list,
      nfeatures = features,
      assay = rep(assay, length(x = object.list))
    )
  }
  # One SingleCellExperiment per object, restricted to the shared feature set
  objects.sce <- lapply(
    X = object.list,
    FUN = function(x, f) {
      return(as.SingleCellExperiment(x = subset(x = x, features = f)))
    },
    f = features
  )
  # Merged Seurat object that will receive the corrected reduction/assay
  integrated <- merge(
    x = object.list[[1]],
    y = object.list[2:length(x = object.list)]
  )
  # Each SCE is passed as a separate batch argument to fastMNN
  out <- do.call(
    what = batchelor::fastMNN,
    args = c(
      objects.sce,
      list(...)
    )
  )
  # NOTE(review): assumes fastMNN returns cells in the same order merge()
  # produced them — presumably true since both follow object.list order
  rownames(x = SingleCellExperiment::reducedDim(x = out)) <- colnames(x = integrated)
  colnames(x = SingleCellExperiment::reducedDim(x = out)) <- paste0(reduction.key, 1:ncol(x = SingleCellExperiment::reducedDim(x = out)))
  integrated[[reduction.name]] <- CreateDimReducObject(
    embeddings = SingleCellExperiment::reducedDim(x = out),
    loadings = as.matrix(SingleCellExperiment::rowData(x = out)),
    assay = DefaultAssay(object = integrated),
    key = reduction.key
  )
  # Add reconstructed matrix (gene x cell)
  integrated[[reconstructed.assay]] <- CreateAssayObject(
    data = as(object = SummarizedExperiment::assay(x = out), Class = "sparseMatrix"),
  )
  # Add variable features
  VariableFeatures(object = integrated[[reconstructed.assay]]) <- features
  # Remaining fastMNN metadata is kept retrievable via Seurat::Tool()
  Tool(object = integrated) <- S4Vectors::metadata(x = out)
  integrated <- LogSeuratCommand(object = integrated)
  return(integrated)
}
================================================ FILE: R/fast_mnn_v5.R ================================================
#' @include internal.R
#'
NULL
#' Run fastMNN in Seurat 5
#'
#' @param object A merged seurat object
#' @param groups A one-column data frame with grouping information
#' @param layers Layers to use
#' @param assay Assay to use, defaults to the default assay of the first object
#' @param features Either a list of features to use when calculating batch
#' correction, or a number (2000 by default) of variable features to select.
#' @param reduction.name Name to store resulting DimReduc object as
#' @param reduction.key Key for resulting DimReduc
#' @param reconstructed.assay Name for the assay containing the low-rank
#' reconstruction of the expression matrix.
#' @param verbose Print messages
#' @param ...
#' Extra parameters passed to \code{\link[batchelor]{fastMNN}}
#'
#' @return A Seurat object merged from the objects in \code{object.list} and a
#' new DimReduc of name \code{reduction.name} (key set to \code{reduction.key})
#' with corrected embeddings matrix as well as the rotation matrix used for the
#' PCA stored in the feature loadings slot. Also returns an expression matrix
#' reconstructed from the low-rank approximation in the
#' \code{reconstructed.assay} assay; all other metadata info
#' \code{\link[batchelor]{fastMNN}} is stored in the \code{tool} slot,
#' accessible with \code{\link[Seurat]{Tool}}
#'
#' @importFrom Seurat DefaultAssay DefaultAssay<- SelectIntegrationFeatures VariableFeatures VariableFeatures<-
#' as.SingleCellExperiment CreateDimReducObject Tool<- LogSeuratCommand
#' @importFrom rlang check_installed
#'
#' @export
#'@note This function requires the
#' \href{https://rdrr.io/github/LTLA/batchelor/}{\pkg{batchelor}} package
#' to be installed
#'
#' @examples
#' \dontrun{
#' # Preprocessing
#' obj <- SeuratData::LoadData("pbmcsca")
#' obj[["RNA"]] <- split(obj[["RNA"]], f = obj$Method)
#' obj <- NormalizeData(obj)
#' obj <- FindVariableFeatures(obj)
#' obj <- ScaleData(obj)
#' obj <- RunPCA(obj)
#'
#' # After preprocessing, we integrate layers:
#' obj <- IntegrateLayers(object = obj, method = FastMNNIntegration,
#'   new.reduction = 'integrated.mnn', verbose = FALSE)
#'
#' # We can also add parameters specific to FastMNN.
#' # Here we set `k` to specify the number of nearest neighbors to use when identifying MNNs:
#' obj <- IntegrateLayers(object = obj, method = FastMNNIntegration,
#'   new.reduction = 'integrated.mnn', k = 15, verbose = FALSE)
#' }
#'
#' @seealso \code{\link[batchelor]{fastMNN}} \code{\link[Seurat]{Tool}}
#'
# NOTE: orig, groups and scale.layer are accepted but unused here; they are
# part of the signature IntegrateLayers passes to every integration method.
FastMNNIntegration <- function(
  object,
  assay = NULL,
  orig = NULL,
  groups = NULL,
  layers = NULL,
  scale.layer = NULL,
  features = 2000,
  new.reduction = "integrated.mnn",
  reduction.key = "mnn_",
  reconstructed.assay = "mnn.reconstructed",
  verbose = TRUE,
  ...
) {
  check_installed(
    pkg = "batchelor",
    reason = "for running integration with mnnCorrect"
  )
  # IntegrateLayers hands over a bare (split) assay; wrap it in a Seurat object
  object <- CreateSeuratObject(object)
  if (is.numeric(x = features)) {
    if (verbose) {
      message(paste("Computing", features, "integration features"))
    }
    features <- SelectIntegrationFeatures5(object = object, features = features)
  }
  layers <- layers %||% Layers(object, search = 'data')
  if (verbose) {
    message("Converting layers to SingleCellExperiment")
  }
  # One SCE per layer (= one per batch), restricted to the integration features
  objects.sce <- lapply(
    X = layers,
    FUN = function(x, f) {
      return(as.SingleCellExperiment(
        x = subset(x = object,
                   features = f,
                   cells = colnames(LayerData(object, layer = x))))
      )
    },
    f = features
  )
  if (verbose) {
    message("Running fastMNN")
  }
  # Each SCE is passed as a separate batch argument to fastMNN
  out <- do.call(
    what = batchelor::fastMNN,
    args = c(
      objects.sce,
      list(...)
    )
  )
  colnames(x = SingleCellExperiment::reducedDim(x = out)) <- paste0(reduction.key, 1:ncol(x = SingleCellExperiment::reducedDim(x = out)))
  reduction <- CreateDimReducObject(
    embeddings = SingleCellExperiment::reducedDim(x = out),
    loadings = as.matrix(SingleCellExperiment::rowData(x = out)),
    assay = DefaultAssay(object = object),
    key = reduction.key
  )
  # Add reconstructed matrix (gene x cell)
  reconstructed_assay <- CreateAssayObject(
    data = as(object = SummarizedExperiment::assay(x = out), Class = "sparseMatrix"),
  )
  # Add variable features
  VariableFeatures(object = reconstructed_assay) <- features
  #Tool(object = object) <- S4Vectors::metadata(x = out)
  #object <- LogSeuratCommand(object = object)
  # IntegrateLayers expects a named list of new objects, not a Seurat object
  output.list <- list(reduction, reconstructed_assay)
  names(output.list) <- c(new.reduction, reconstructed.assay)
  return(output.list)
}
# Registers this function as an integration method for Seurat::IntegrateLayers
attr(x = FastMNNIntegration, which = 'Seurat.method') <- 'integration'
================================================ FILE: R/glmpca.R ================================================
#' @include internal.R
#'
NULL
#' Run GLMPCA
#'
#' @param object A Seurat object
#' @param L The number of dimensions to return (defaults to 5)
#' @param assay Assay to use, defaults to the default assay
#' @param features A list of features to use when performing GLM-PCA. If null, defaults to variable features.
#' @param reduction.name Name to store resulting DimReduc object as. Defaults to glmpca
#' @param reduction.key Key for resulting DimReduc. Defaults to GLMPC_
#' @param ... Extra parameters passed to \code{\link[glmpca]{glmpca}}
#'
#' @return A Seurat object containing the output of GLMPCA stored as a DimReduc object.
#' @importFrom Seurat DefaultAssay DefaultAssay<- CreateDimReducObject Tool<- LogSeuratCommand
#'
#' @author Will Townes
#' @references Townes, W., Hicks, SC, Aryee, MJ, Irizarry, RA. (2019). "Feature selection and dimension reduction for single-cell RNA-Seq based on a multinomial model."
#' Genome Biology.
#' #' @examples #' \dontrun{ #' pbmc_small #' pbmc_small <- RunGLMPCA(pbmc_small) #' DimPlot(pbmc_small, redunction = 'glmpca') #' } #' #' @export #' RunGLMPCA <- function( object, L = 5, assay = NULL, features = NULL, reduction.name = 'glmpca', reduction.key = 'GLMPC_', verbose = TRUE, ... ) { CheckPackage(package = 'glmpca', repository = 'CRAN') if (!inherits(x = object,what = 'Seurat')) { stop("'object' must be a Seurat object", call. = FALSE) } assay <- assay %||% DefaultAssay(object = object) DefaultAssay(object = object) <- assay features <- features %||% VariableFeatures(object) data <- GetAssayData(object = object, slot = 'counts') features <- intersect(x = features, y = rownames(x = data)) if (length(x = features) == 0) { stop("Please specify a subset of features for GLM-PCA") } data <- data[features, ] glmpca_results <- glmpca:::glmpca(Y = data, L = L, ...) glmpca_dimnames <- paste0(reduction.key, 1:L) factors<-as.matrix(glmpca_results$factors) loadings<-as.matrix(glmpca_results$loadings) colnames(x = factors) <- glmpca_dimnames colnames(x = loadings) <- glmpca_dimnames factors_l2_norm <- sqrt(colSums(factors^2)) #strip S3 class "glmpca" to enable it to pass validObject() class(glmpca_results)<-NULL #save memory by removing factors and loadings since they are stored separately glmpca_results$factors<-glmpca_results$loadings<-NULL object[[reduction.name]] <- CreateDimReducObject( embeddings = factors, key = reduction.key, loadings = loadings, stdev = factors_l2_norm, assay = assay, global = TRUE, misc = glmpca_results ) object <- LogSeuratCommand(object = object) return(object) } ================================================ FILE: R/internal.R ================================================ #' @importFrom BiocManager install #' @importFrom remotes install_github #' @importFrom Seurat IsGlobal Reductions #' NULL #' @docType package #' @name SeuratWrappers-package #' @rdname SeuratWrappers-package #' "_PACKAGE" # Set a default value if an object is null 
# # @param lhs An object to set if it's null # @param rhs The value to provide if x is null # # @return rhs if lhs is null, else lhs # # @author Hadley Wickham # @references https://adv-r.hadley.nz/functions.html#missing-arguments # `%||%` <- function(lhs, rhs) { if (!is.null(x = lhs)) { return(lhs) } else { return(rhs) } } # Set a default value if an object is NOT null # # @param lhs An object to set if it's NOT null # @param rhs The value to provide if x is NOT null # # @return lhs if lhs is null, else rhs # # @author Hadley Wickham # @references https://adv-r.hadley.nz/functions.html#missing-arguments # `%iff%` <- function(lhs, rhs) { if (!is.null(x = lhs)) { return(rhs) } else { return(lhs) } } # Get dimensional reduction information associated with an assay # # @param object A \code{Seurat} object # @param assay Name of assay that dimensional reduction objects should be # associated with # @param global Include global dimensional reductions # # @return A vector of dimensional reduction names # # @keywords internal # AssociatedDimReducs <- function( object, assay = DefaultAssay(object = object), global = TRUE ) { return(Filter( f = function(x) { check <- DefaultAssay(object = object[[x]]) == assay if (global) { check <- c(check, IsGlobal(object = object[[x]])) } return(any(check)) }, x = Reductions(object = object) )) } # Find the default DimReduc # # Searches for DimReducs matching 'umap', 'tsne', or 'pca', case-insensitive, and # in that order. Priority given to DimReducs matching the DefaultAssay or assay specified # (eg. 
'pca' for the default assay weights higher than 'umap' for a non-default assay)
#
# @param object A Seurat object
# @param assay Name of assay to use; defaults to the default assay of the object
#
# @return The default DimReduc, if possible
#
#
DefaultDimReduc <- function(object, assay = NULL) {
  assay <- assay %||% DefaultAssay(object = object)
  # Search order: umap first, then tsne, then pca
  drs.use <- c('umap', 'tsne', 'pca')
  dim.reducs <- Reductions(object = object)
  # First pass: restrict to reductions whose default assay matches `assay`
  drs.assay <- Filter(
    f = function(x) {
      return(DefaultAssay(object = object[[x]]) == assay)
    },
    x = dim.reducs
  )
  if (length(x = drs.assay) > 0) {
    # One grep result per candidate name, in priority order; drop empty hits
    index <- lapply(
      X = drs.use,
      FUN = grep,
      x = drs.assay,
      ignore.case = TRUE
    )
    index <- Filter(f = length, x = index)
    if (length(x = index) > 0) {
      return(drs.assay[min(index[[1]])])
    }
  }
  # Second pass: fall back to any reduction in the object, same priority order
  index <- lapply(
    X = drs.use,
    FUN = grep,
    x = dim.reducs,
    ignore.case = TRUE
  )
  index <- Filter(f = length, x = index)
  if (length(x = index) < 1) {
    stop(
      "Unable to find a DimReduc matching one of '",
      paste(drs.use[1:(length(x = drs.use) - 1)], collapse = "', '"),
      "', or '",
      drs.use[length(x = drs.use)],
      "', please specify a dimensional reduction to use",
      call. = FALSE
    )
  }
  return(dim.reducs[min(index[[1]])])
}

# Check to ensure a package is installed
#
# @param package Name of package to check
# @param repository Repository that package is available on;
# choose from 'bioconductor', 'github', or 'cran'
# @param ... Extra parameters passed to BiocManager::install, remotes::install_github, or install.packages, depending on \code{repository}
#
#' @importFrom utils menu install.packages
#
CheckPackage <- function(package, repository, ...)
{
  # basename() strips a GitHub "owner/repo" prefix down to the package name
  if (!requireNamespace(package = basename(path = package), quietly = TRUE)) {
    # Only offer to install interactively; non-interactive sessions fall
    # through to the stop() below
    if (interactive()) {
      message("Package ", package, " is not yet installed")
      message("Install now?")
      choice <- menu(choices = c('yes', 'no'))
      if (choice == 1) {
        repository <- match.arg(
          arg = tolower(x = repository),
          choices = c('github', 'bioconductor', 'cran')
        )
        switch(
          EXPR = repository,
          'github' = remotes::install_github(repo = package, ...),
          'bioconductor' = BiocManager::install(pkgs = package, ...),
          'cran' = install.packages(pkgs = package, ...),
          stop("Unknown repository ", repository, call. = FALSE)
        )
        return(invisible(x = NULL))
      }
    }
    stop("Unable to find package ", package, ", please install", call. = FALSE)
  }
}

# Check if a matrix is empty
#
# Takes a matrix and asks if it's empty (either 0x0 or 1x1 with a value of NA)
#
# @param x A matrix
#
# @return Whether or not \code{x} is empty
#
IsMatrixEmpty <- function(x) {
  matrix.dims <- dim(x = x)
  matrix.na <- all(matrix.dims == 1) && all(is.na(x = x))
  return(all(matrix.dims == 0) || matrix.na)
}


================================================
FILE: R/liger.R
================================================
#' @include internal.R
#'
NULL

#' Run optimizeALS on a Seurat object
#'
#' @inheritParams rliger::optimizeALS
#' @inheritParams RunFastMNN
#' @param object A merged Seurat object
#' @param split.by Attribute for splitting, defaults to "orig.ident"
#' @param ...
Arguments passed to other methods
#'
#' @return A Seurat object with embeddings and loadings from \code{\link[liger]{optimizeALS}}
#' stored as a DimReduc object with name \code{reduction.name} (key set to \code{reduction.key});
#' per-dataset feature loadings matrices stored in the \code{tool} slot, accessible with
#' \code{\link[Seurat]{Tool}}
#'
# @importFrom rliger optimizeALS
#' @importFrom Seurat DefaultAssay SplitObject GetAssayData VariableFeatures
#' CreateDimReducObject Tool<- LogSeuratCommand
#'
#' @aliases optimizeALS
#' @seealso \code{\link[rliger]{optimizeALS}} \code{\link[Seurat]{Tool}}
#'
#' @export
# @method optimizeALS Seurat
#'
RunOptimizeALS <- function(
  object,
  k,
  assay = NULL,
  split.by = 'orig.ident',
  lambda = 5,
  thresh = 1e-6,
  max.iters = 30,
  reduction.name = 'iNMF_raw',
  reduction.key = 'riNMF_',
  nrep = 1,
  H.init = NULL,
  W.init = NULL,
  V.init = NULL,
  rand.seed = 1,
  print.obj = FALSE,
  ...
) {
  CheckPackage(package = 'rliger', repository = 'cran')
  assay <- assay %||% DefaultAssay(object = object)
  if (IsMatrixEmpty(x = GetAssayData(object = object, slot = 'scale.data'))) {
    # error-message typo fixed ("splease" -> "please")
    stop("Data is unscaled, please scale before running", call. = FALSE)
  }
  # Accept either a metadata column name or a per-cell grouping vector
  if (is.character(x = split.by) && length(x = split.by) == 1) {
    split.by <- object[[split.by]]
  }
  split.cells <- split(x = colnames(x = object), f = split.by)
  # rliger expects one cells-by-genes matrix per dataset, hence the transpose
  scale.data <- lapply(
    X = split.cells,
    FUN = function(x) {
      return(t(x = GetAssayData(
        object = object,
        slot = 'scale.data',
        assay = assay
      )[, x]))
    }
  )
  # scale.data <- sapply(X = scale.data, FUN = t, simplify = FALSE)
  out <- rliger::optimizeALS(
    object = scale.data,
    k = k,
    lambda = lambda,
    thresh = thresh,
    max.iters = max.iters,
    nrep = nrep,
    H.init = H.init,
    W.init = W.init,
    V.init = V.init,
    rand.seed = rand.seed,
    print.obj = print.obj
  )
  colnames(x = out$W) <- VariableFeatures(object = object)
  # H: per-dataset cell factor matrices, stacked into one embeddings matrix;
  # W: shared factor loadings, transposed to features x factors
  object[[reduction.name]] <- CreateDimReducObject(
    embeddings = do.call(what = 'rbind', args = out$H),
    loadings = t(x = out$W),
    assay = assay,
    key = reduction.key
  )
  # Stash the dataset-specific loadings (V matrices) in the tool slot
  Tool(object = object) <- sapply(
    X = out$V,
    FUN = function(x) {
      colnames(x = x) <- VariableFeatures(object = object)
      rownames(x = x) <- colnames(x = object[[reduction.name]])
      return(t(x = x))
    },
    simplify = FALSE
  )
  object <- LogSeuratCommand(object = object)
  return(object)
}

#' Generate shared factor neighborhood graph
#'
#' This is a deprecated function. Call 'RunQuantileNorm' instead.
#'
# @inheritParams rliger::SNF
#' @inheritParams RunOptimizeALS
#' @param reduction Name of reduction to use
#'
#' @return A Seurat object with the SNF list stored in the \code{tool} slot,
#' accessible with \code{\link[Seurat]{Tool}}
#'
#' @importFrom Seurat SplitObject Embeddings Tool<- LogSeuratCommand
#'
#' @aliases SNF
#' @seealso \code{\link[rliger]{RunQuantileNorm}} \code{\link[Seurat]{Tool}}
#'
#' @export
# @method SNF Seurat
#'
RunSNF <- function(
  object,
  split.by = 'orig.ident',
  reduction = 'iNMF_raw',
  dims.use = NULL,
  dist.use = 'CR',
  center = FALSE,
  knn_k = 20,
  k2 = 500,
  small.clust.thresh = knn_k,
  ...
) { CheckPackage(package = 'rliger', repository = 'cran') # cells <- sapply( # X = SplitObject(object = object, split.by = split.by), # FUN = colnames, # simplify = FALSE # ) # dims.use <- dims.use %||% 1:length(x = object[[reduction]]) # embeddings <- sapply( # X = cells, # FUN = function(x) { # return(Embeddings(object = object[[reduction]])[x, ]) # }, # simplify = FALSE, # USE.NAMES = TRUE # ) # snf <- liger::SNF( # object = embeddings, # dims.use = dims.use, # dist.use = dist.use, # center = center, # knn_k = knn_k, # k2 = k2, # small.clust.thresh = small.clust.thresh, # ... # ) # Tool(object = object) <- snf # object <- LogSeuratCommand(object = object) # return(object) .Deprecated( new = 'RunQuantileNorm', msg = paste( "This is a deprecated function. Call 'RunQuantileNorm' instead." ) ) } #' Run quantileAlignSNF on a Seurat object #' #' This is a deprecated function. Call 'RunQuantileNorm' instead. #' #' @inheritParams RunSNF #' @inheritParams RunOptimizeALS #' @inheritParams rliger::quantileAlignSNF #' @param recalc.snf Recalculate \code{\link{SNF}} #' @param ... 
Arguments passed to other methods, and to #' \code{\link[seurat.wrappers]{SNF}} if \code{recalc.snf = TRUE} or #' \code{\link[seurat.wrappers]{SNF}} hasn't been run #' #' @return A Seurat object with embeddings from \code{\link[liger]{quantileAlignSNF}} #' stored as a DimReduc object with name \code{reduction.name} (key set to \code{reduction.key}) #' # @importFrom rliger quantileAlignSNF #' @importFrom Seurat Tool SplitObject Embeddings CreateDimReducObject #' DefaultAssay Tool<- Idents<- LogSeuratCommand #' #' @aliases quantileAlignSNF #' @seealso \code{\link[rliger]{RunQuantileNorm}} #' #' @export # @method quantileAlignSNF Seurat #' RunQuantileAlignSNF <- function( object, split.by = 'orig.ident', reduction = 'iNMF_raw', reduction.name = 'iNMF', reduction.key = 'iNMF_', recalc.snf = FALSE, ref_dataset = NULL, prune.thresh = 0.2, min_cells = 2, quantiles = 50, nstart = 10, resolution = 1, center = FALSE, id.number = NULL, print.mod = FALSE, print.align.summary = FALSE, ... ) { # CheckPackage(package = 'rliger', repository = 'cran') # if (recalc.snf || is.null(x = Tool(object = object, slot = 'RunSNF'))) { # object <- RunSNF( # object = object, # split.by = split.by, # reduction = reduction, # center = center, # ... # ) # } # embeddings <- sapply( # X = SplitObject(object = object, split.by = split.by), # FUN = function(x) { # return(Embeddings(object = x[[reduction]])) # }, # simplify = FALSE, # USE.NAMES = TRUE # ) # if (is.null(x = ref_dataset)) { # num.samples <- vapply( # X = embeddings, # FUN = nrow, # FUN.VALUE = integer(length = 1L) # ) # ref_dataset <- names(x = embeddings)[which.max(x = num.samples)] # } else if (is.numeric(x = ref_dataset)) { # ref_dataset <- names(x = embeddings)[ref_dataset] # } # if (is.character(x = ref_dataset) && !ref_dataset %in% names(x = embeddings)) { # stop("Cannot find reference dataset '", ref_dataset, "' in the split", call. 
= FALSE) # } # out <- rliger::quantileAlignSNF( # object = embeddings, # snf = Tool(object = object, slot = 'RunSNF'), # cell.names = colnames(x = object), # ref_dataset = ref_dataset, # prune.thresh = prune.thresh, # min_cells = min_cells, # quantiles = quantiles, # nstart = nstart, # resolution = resolution, # center = center, # id.number = id.number, # print.mod = print.mod, # print.align.summary = print.align.summary, # ... # ) # object[[reduction.name]] <- CreateDimReducObject( # embeddings = out$H.norm, # assay = DefaultAssay(object = object[[reduction]]), # key = reduction.key # ) # out <- as.data.frame(x = out[names(x = out) != 'H.norm']) # object[[colnames(x = out)]] <- out # Idents(object = object) <- 'clusters' # object <- LogSeuratCommand(object = object) # return(object) message(paste( "This is a deprecated function. Calling 'RunQuantileNorm' instead.", "Note that not all parameters can be passed to 'RunQuantileNorm'.", "It's suggested to run 'louvainCluster' subsequently as well." )) .Deprecated( new = 'RunQuantileNorm', msg = paste( "This is a deprecated function. Calling 'quantile_norm' instead.", "Note that not all parameters can be passed to 'quantile_norm'.", "It's suggested to run 'louvainCluster' subsequently as well." ) ) return(RunQuantileNorm(object, split.by = split.by, reduction = reduction, reduction.name = reduction.name, reduction.key = reduction.key, quantiles = quantiles, ref_dataset = NULL, min_cells = 20, knn_k = 20, dims.use = NULL, do.center = FALSE, max_sample = 1000, eps = 0.9, refine.knn = TRUE, ... )) } #' Run quantile_norm on a Seurat object #' #' @inheritParams RunOptimizeALS #' @inheritParams rliger::quantile_norm #' @param ... 
Arguments passed to other methods #' #' @return A Seurat object with embeddings from \code{\link[liger]{quantile_norm}} #' stored as a DimReduc object with name \code{reduction.name} (key set to \code{reduction.key}) #' # @importFrom rliger quantile_norm #' @importFrom Seurat Tool SplitObject Embeddings CreateDimReducObject #' DefaultAssay Tool<- Idents<- LogSeuratCommand #' #' @aliases quantile_norm #' @seealso \code{\link[rliger]{quantile_norm}} #' #' @export # @method quantile_norm Seurat #' RunQuantileNorm <- function( object, split.by = 'orig.ident', reduction = 'iNMF_raw', reduction.name = 'iNMF', reduction.key = 'iNMF_', quantiles = 50, ref_dataset = NULL, min_cells = 20, knn_k = 20, dims.use = NULL, do.center = FALSE, max_sample = 1000, eps = 0.9, refine.knn = TRUE, ... ) { CheckPackage(package = 'rliger', repository = 'cran') embeddings <- sapply( X = SplitObject(object = object, split.by = split.by), FUN = function(x) { return(Embeddings(object = x[[reduction]])) }, simplify = FALSE, USE.NAMES = TRUE ) if (is.null(x = ref_dataset)) { num.samples <- vapply( X = embeddings, FUN = nrow, FUN.VALUE = integer(length = 1L) ) ref_dataset <- names(x = embeddings)[which.max(x = num.samples)] } else if (is.numeric(x = ref_dataset)) { ref_dataset <- names(x = embeddings)[ref_dataset] } if (is.character(x = ref_dataset) && !ref_dataset %in% names(x = embeddings)) { stop("Cannot find reference dataset '", ref_dataset, "' in the split", call. = FALSE) } out <- rliger::quantile_norm( object = embeddings, quantiles = quantiles, ref_dataset = ref_dataset, min_cells = min_cells, knn_k = knn_k, dims.use = dims.use, do.center = do.center, max_sample = max_sample, eps = eps, refine.knn = refine.knn, ... 
) object[[reduction.name]] <- CreateDimReducObject( embeddings = out$H.norm, assay = DefaultAssay(object = object[[reduction]]), key = reduction.key ) out <- as.data.frame(x = out[names(x = out) != 'H.norm']) object[[colnames(x = out)]] <- out Idents(object = object) <- 'clusters' object <- LogSeuratCommand(object = object) return(object) } ================================================ FILE: R/miqc.R ================================================ #' @include internal.R #' NULL #' Run miQC on a Seurat object #' #' @param object Seurat object #' @param percent.mt (character) Name of the column in the Seurat metadata that #' contains the percent of reads attributed to mitochondrial genes. #' Defaults to "percent.mt". #' @param nFeature_RNA (character) Name of the column in the Seurat metadata that #' contains the number of reads per cell. Defaults to "nFeature_RNA". #' @param posterior.cutoff numeric) The posterior probability of a cell being #' part of the compromised distribution, a number between 0 and 1. Any cells #' below the appointed cutoff will be marked to keep. #' Defaults to 0.75. #' @param model.type (character) What type of model to generate. A linear #' mixture model ("linear") is recommended, but currently b-spline ("spline") #' and two-degree polynomial ("polynomial") are also supported #' Default = "linear". #' @param backup.option (character) In case flexmix fails to build a 2 cluster #' mixture model, what should RunMiQC do: "percent" (set miQC.keep values #' according to backup.percent), "percentile" (set miQC.keep values according #' to backup.percentile), "pass" (return original Seurat object), or "halt" #' (stop RunMiQC). "percent", "percentile", and "pass" are useful when #' processing multiple Seurat objects sequentially. #' @param backup.percentile (numeric) What percentile to use as cutoff in case #' flexmix fails to build a 2 cluster mixture model. Will only be used if #' backup.option is "percentile". 
#' @param backup.percent (numeric) What percent to use as cutoff in case flexmix #' fails to build a 2 cluster mixture model. Will only be used if #' backup.option is "percent". #' @param verbose Boolean. TRUE to show progress messages, FALSE to hide progress messages #' @details (Copied verbatim from miQC) _Function to fit a two-distribution mixture model on a Seurat object and find those cells probabistically determined to be compromised by the mixture model._ #' #' @return Returns a Seurat object with probabilities and "keep" decisions stored as "miQC.probability" and "miQC.keep" in the object metadata, respectively. #' @references Hippen et al. (2021) miQC: An adaptive probabilistic framework for quality control of single-cell RNA-sequencing data. bioRxiv doi: 10.1101/2021.03.03.433798 #' #' @importFrom rlang %||% #' #' @export RunMiQC <- function( object, percent.mt = "percent.mt", nFeature_RNA = "nFeature_RNA", posterior.cutoff = 0.75, model.type = "linear", model.slot = "flexmix_model", verbose = TRUE, backup.option = "percentile", backup.percentile = 0.99, backup.percent = 5, ... ) { SeuratWrappers:::CheckPackage(package = 'flexmix', repository = "CRAN") my_data <- Seurat::FetchData(object, vars = c(percent.mt, nFeature_RNA)) colnames(my_data) <- c("percent.mt", "nFeature_RNA") if(!(model.type %in% c("linear", "spline", "polynomial"))){ stop("model.type must be one of \"linear\", \"spline\", or \"polynomial\"") } #implementing tryCatch because of internal flexmix error when model fitting #fails. 
see https://github.com/satijalab/seurat-wrappers/issues/108 my_model <- tryCatch({ if (model.type == "linear") { my_model <- flexmix::flexmix(percent.mt~nFeature_RNA, data = my_data, k = 2) } else if (model.type == "spline") { my_model <- flexmix::flexmix(percent.mt~splines::bs(nFeature_RNA), data = my_data, k = 2) } else if (model.type == "polynomial") { my_model <- flexmix::flexmix(percent.mt~poly(nFeature_RNA, degree = 2), data = my_data, k = 2) } }, error=function(e){ cat("flexmix fitting error:", conditionMessage(e),"\n") my_model <- NULL}) # #set a variable = model_status to denote the status of the model fitting #1 = model fit successfully #2 = model fit only 1 cluster #3 = model fails at flexmix stage # if(is.null(my_model)){ model_status <- 3 model_message <- "flexmix internal failure" } else if (ncol(flexmix::parameters(my_model)) == 1){ model_status <- 2 model_message <- "flexmix returned only 1 cluster" } else if (ncol(flexmix::parameters(my_model)) == 2){ model_status <- 1 model_message <- "flexmix model fit successfully" } else { stop("model_status error, please post issue on GitHub") } if(model_status %in% c(2,3)){ if(model_status == 2){ warning(model_message) } if(model_status == 3){ warning(model_message) } if(backup.option == "halt"){ stop("Halting.")} else if (backup.option == "pass") { message("returning object without miQC model or stats") return(object)} else if (backup.option == "percentile") { message("defaulting to backup.percentile for filtering") compromised_probability <- 0 raw_values <- my_data[,percent.mt] percentile_cutoff <- quantile(raw_values, probs = backup.percentile) cells_to_keep <- ifelse(raw_values <= percentile_cutoff, "keep", "discard")} else if (backup.option == "percent"){ message("defaulting to backup.percent for filtering") compromised_probability <- 0 raw_values <- my_data[,percent.mt] cells_to_keep <- ifelse(raw_values <= backup.percent, "keep", "discard")} else { stop("backup.option must be one of \"percentile\", 
\"percent\", \"halt\", or \"pass\"") } } else if (model_status == 1){ Misc(object, model.slot) <- my_model my_model_parameters <- flexmix::parameters(my_model) my_model_posterior <- flexmix::posterior(my_model) intercept1 <- my_model_parameters[,1][1] intercept2 <- my_model_parameters[,2][1] if (intercept1 > intercept2) { compromised_dist <- 1 } else { compromised_dist <- 2 } compromised_probability <- my_model_posterior[,compromised_dist] cells_to_keep <- ifelse(compromised_probability <= posterior.cutoff, "keep", "discard") } else { stop("model_status error, please post issue on GitHub") } object <- Seurat::AddMetaData(object = object, metadata = compromised_probability, col.name = "miQC.probability") object <- Seurat::AddMetaData(object = object, metadata = cells_to_keep, col.name = "miQC.keep") object <- Seurat::LogSeuratCommand(object) return(object) } #' Run miQC on a Seurat object #' #' @param object Seurat object #' @details _Function to plot the miQC mixture model stored in a Seurat object. `RunMiQC` must be run prior to plotting._ #' @references Hippen et al. (2021) miQC: An adaptive probabilistic framework for quality control of single-cell RNA-sequencing data. 
bioRxiv doi: 10.1101/2021.03.03.433798 #' #' @importFrom rlang %||% #' #' @export PlotMiQC <- function(seurat_object, percent.mt = "percent.mt", nFeature_RNA = "nFeature_RNA", model.slot = "flexmix_model", color.by = "miQC.probability") { features_to_fetch <- c(percent.mt, nFeature_RNA, "miQC.probability", "miQC.keep", color.by) features_to_fetch <- unique(features_to_fetch) my_data <- Seurat::FetchData(seurat_object, vars = features_to_fetch) colnames(my_data)[1:2] <- c("percent.mt", "nFeature_RNA") my_model <- Misc(seurat_object, model.slot) my_model_parameters <- cbind(my_data, flexmix::fitted(my_model)) #<<< code from plotModel in miQC package >>> intercept1 <- flexmix::parameters(my_model, component = 1)[1] intercept2 <- flexmix::parameters(my_model, component = 2)[1] if (intercept1 > intercept2) { compromised_dist <- 1 } else { compromised_dist <- 2 } ggplot2::ggplot(my_data, ggplot2::aes(x = nFeature_RNA, y = percent.mt, colour = !!ggplot2::sym(color.by))) + ggplot2::labs(x = "Unique genes found", y = "Percent reads mitochondrial", color = color.by) + ggplot2::geom_point() + ggplot2::geom_line(data = my_model_parameters, inherit.aes = FALSE, ggplot2::aes(x = nFeature_RNA, y = Comp.1), lwd = 2) + ggplot2::geom_line(data = my_model_parameters, inherit.aes = FALSE, ggplot2::aes(x = nFeature_RNA, y = Comp.2), lwd = 2) + ggplot2::ylim(c(0, NA))+ cowplot::theme_cowplot() } ================================================ FILE: R/monocle3.R ================================================ #' @include internal.R #' @importFrom Seurat DefaultAssay Idents<- #' @importFrom methods as slot<- slot #' NULL clusters.key <- 'monocle3_clusters' partitions.key <- 'monocle3_partitions' #' Convert objects to Monocle3 \code{cell_data_set} objects #' #' @param x An object #' @param ... 
Arguments passed to other methods #' #' @return A \code{cell_data_set} object #' #' @name as.cell_data_set #' @rdname as.cell_data_set #' #' @aliases as.CellDataSet #' #' @export #' as.cell_data_set <- function(x, ...) { CheckPackage(package = 'cole-trapnell-lab/monocle3', repository = 'github') UseMethod(generic = 'as.cell_data_set', object = x) } #' @inheritParams Seurat::as.SingleCellExperiment #' @param reductions A vector of dimensional reductions add to the #' \code{cell_data_set} object; defaults to all dimensional reductions #' calculated from \code{assay} and all \link[Seurat:IsGlobal]{global} #' dimensional reductions #' @param default.reduction Name of dimensional reduction to use for clustering #' name #' @param graph Name of graph to be used for clustering results #' @param group.by Name of cell-level metadata column to use as identites; pass # \code{NULL} to use the active identites #' #' @importFrom Seurat as.SingleCellExperiment GetAssayData Loadings #' Embeddings Stdev Idents #' #' @details The \code{\link[Seurat]{Seurat}} method utilizes #' \code{\link[Seurat]{as.SingleCellExperiment}} to transfer over expression #' and cell-level metadata. The following additional information is also #' transferred over: #' \itemize{ #' \item Cell emebeddings are transferred over to the #' \code{\link[SingleCellExperiment]{reducedDims}} slot. Dimensional reduction #' names are converted to upper-case (eg. \dQuote{umap} to \dQuote{UMAP}) to #' match Monocle 3 style #' \item Feature loadings are transfered to #' \code{cds@reduce_dim_aux$gene_loadings} if present. \strong{NOTE}: only the #' feature loadings of the last dimensional reduction are transferred over #' \item Standard deviations are added to #' \code{cds@reduce_dim_aux$prop_var_expl} if present. 
\strong{NOTE}: only the #' standard deviations of the last dimensional reduction are transferred over #' \item Clustering information is transferred over in the following manner: if #' cell-level metadata entries \dQuote{monocle3_clusters} and #' \dQuote{monocle3_partitions} exist, then these will be set as the clusters #' and partitions, with no nearest neighbor graph being added to the object; #' otherwise, Seurat's nearest-neighbor graph will be converted to an #' \code{\link[igraph]{igraph}} object and added to the \code{cell_data_set} #' object along with Seurat's clusters. No partition information is added when #' using Seurat's clsuters #' } #' #' @seealso \code{\link[Seurat]{as.SingleCellExperiment}} #' #' @rdname as.cell_data_set #' @method as.cell_data_set Seurat #' @export #' as.cell_data_set.Seurat <- function( x, assay = DefaultAssay(object = x), reductions = AssociatedDimReducs(object = x, assay = assay), default.reduction = DefaultDimReduc(object = x, assay = assay), graph = paste0(assay, '_snn'), group.by = NULL, ... 
) { # Add assay data # Cheat and use as.SingleCellExperiment cds <- as( object = as.SingleCellExperiment(x = x, assay = assay), Class = 'cell_data_set' ) # Ensure we have a counts assay if (is.null(x = SummarizedExperiment::assays(x = cds)$counts)) { SummarizedExperiment::assays(x = cds)$counts <- SummarizedExperiment::assays(x = cds)[[1]] } # Add Size_factor if (!"Size_Factor" %in% colnames(x = SummarizedExperiment::colData(x = cds))) { size.factor <- paste0('nCount_', assay) if (size.factor %in% colnames(x = x[[]])) { SummarizedExperiment::colData(x = cds)$Size_Factor <- x[[size.factor, drop = TRUE]] } } # Add DimReducs: Embeddings become a reduced dim, Loadings go to # reduce_dim_aux$gene_loadings, Stdev goes go reduce_dim_aux$prop_var_expl # First, reset the ones from as.SingleCellExperiment SingleCellExperiment::reducedDims(x = cds)[SingleCellExperiment::reducedDimNames(x = cds)] <- NULL reductions <- intersect( x = reductions, y = AssociatedDimReducs(object = x, assay = assay) ) for (reduc in reductions) { SingleCellExperiment::reducedDims(x = cds)[[toupper(x = reduc)]] <- Embeddings(object = x[[reduc]]) loadings <- Loadings(object = x[[reduc]]) if (!IsMatrixEmpty(x = loadings)) { slot(object = cds, name = 'reduce_dim_aux')[['gene_loadings']] <- loadings } stdev <- Stdev(object = x[[reduc]]) if (length(x = stdev)) { slot(object = cds, name = 'reduce_dim_aux')[['prop_var_expl']] <- stdev } } # Add clustering information # TODO: Figure out if I need to add relations, distMatrix, or clusters/partitions if (!is.null(x = group.by)) { Idents(object = x) <- group.by } # if (clusters.key %in% colnames(x = x[[]])) { clusters.list <- if (is.null(x = group.by) && all(c(clusters.key, partitions.key) %in% colnames(x = x[[]]))) { message("Using existing Monocle 3 cluster membership and partitions") list( partitions = factor(x = x[[partitions.key, drop = TRUE]]), clusters = factor(x = x[[clusters.key, drop = TRUE]]) ) } else if (graph %in% names(x = x)) { g <- 
igraph::graph_from_adjacency_matrix( adjmatrix = x[[graph]], weighted = TRUE ) # TODO: figure out proper partitioning scheme # partitions <- igraph::components(graph = g)$membership[colnames(x = x)] warning( "Monocle 3 trajectories require cluster partitions, which Seurat does not calculate. Please run 'cluster_cells' on your cell_data_set object", call. = FALSE, immediate. = TRUE ) partitions <- rep_len(x = 1, length.out = ncol(x = x)) list( cluster_result = list( g = g, relations = NULL, distMatrix = 'matrix', coord = NULL, edge_links = NULL, optim_res = list( membership = as.integer(x = Idents(object = x)), modularity = NA_real_ ) ), partitions = factor(x = partitions), clusters = Idents(object = x) ) } else { list() } if (length(x = clusters.list)) { slot(object = cds, name = 'clusters')[[toupper(x = default.reduction)]] <- clusters.list } # TODO: Add translated results from learn_graph return(cds) } #' @param loadings Name of dimensional reduction to save loadings to, if present; #' defaults to first dimensional reduction present (eg. #' \code{SingleCellExperiment::reducedDimNames(x)[1]}); pass \code{NA} to #' suppress transfer of loadings #' @param clusters Name of clustering method to use for setting identity classes #' #' @importFrom Seurat as.Seurat Loadings<- as.Graph DefaultAssay<- #' #' @details The \code{cell_data_set} method for \code{\link[Seurat]{as.Seurat}} #' utilizes the \code{\link[Seurat::as.Seurat]{SingleCellExperiment}} method of #' \code{\link[Seurat]{as.Seurat}} to handle moving over expression data, cell #' embeddings, and cell-level metadata. 
The following additional information
#' will also be transferred over:
#' \itemize{
#' \item Feature loadings from \code{cds@reduce_dim_aux$gene_loadings} will be
#' added to the dimensional reduction specified by \code{loadings} or the name
#' of the first dimensional reduction that contains "pca" (case-insensitive) if
#' \code{loadings} is not set
#' \item Monocle 3 clustering will be set as the default identity class. In
#' addition, the Monocle 3 clustering will be added to cell-level metadata as
#' \dQuote{monocle3_clusters}, if present
#' \item Monocle 3 partitions will be added to cell-level metadata as
#' \dQuote{monocle3_partitions}, if present
#' \item Monocle 3 pseudotime calculations will be added to
#' \dQuote{monocle3_pseudotime}, if present
#' \item The nearest-neighbor graph, if present, will be converted to a
#' \code{\link[Seurat]{Graph}} object, and stored as
#' \dQuote{\code{assay}_monocle3_graph}
#' }
#'
#' @seealso \code{\link[Seurat]{as.Seurat.SingleCellExperiment}}
#'
#' @rdname as.Seurat.extras
#' @method as.Seurat cell_data_set
#' @export
#'
as.Seurat.cell_data_set <- function(
  x,
  counts = 'counts',
  data = NULL,
  assay = 'RNA',
  project = 'cell_data_set',
  loadings = NULL,
  clusters = NULL,
  ...
) {
  CheckPackage(package = 'cole-trapnell-lab/monocle3', repository = 'github')
  # Cheat and pull most information using as.SingleCellExperiment
  # cell_data_set objects inherit SingleCellExperiment
  object <- suppressWarnings(expr = as.Seurat(
    x = as(object = x, Class = 'SingleCellExperiment'),
    assay = assay,
    counts = counts,
    data = data,
    project = project
  ))
  # Pull feature loadings
  # BUGFIX: replaced a vectorized ifelse() (which collapsed the grep result to
  # its first element, or NA when empty) with a scalar if/else; [1] preserves
  # the original "first match, NA when none" semantics explicitly
  lds.reduc <- if (is.null(x = loadings)) {
    grep(
      pattern = 'pca',
      x = SingleCellExperiment::reducedDimNames(x = x),
      ignore.case = TRUE,
      value = TRUE
    )[1]
  } else {
    loadings
  }
  if (length(x = lds.reduc) && !is.na(x = lds.reduc)) {
    loadings <- slot(object = x, name = 'reduce_dim_aux')[['gene_loadings']]
    if (!is.null(x = loadings)) {
      Loadings(object = object[[lds.reduc]], projected = FALSE) <- loadings
    }
  }
  # Pull cluster information and pseudotime
  if (length(x = slot(object = x, name = 'clusters'))) {
    # `clusters` names the reduction whose Monocle 3 clustering should be used
    clusters <- clusters %||% DefaultDimReduc(object = object)
    # BUGFIX: `clusters.key`/`partitions.key` were undefined variables; use the
    # metadata column names promised by the documentation above
    object[['monocle3_clusters']] <- Idents(object = object) <- monocle3::clusters(
      x = x,
      reduction_method = clusters
    )
    object[['monocle3_partitions']] <- monocle3::partitions(
      x = x,
      reduction_method = clusters
    )
    graph <- slot(object = x, name = 'clusters')[[clusters]]$cluster_result$g[]
    try(
      expr = {
        graph <- as.Graph(x = graph)
        DefaultAssay(object = graph) <- DefaultAssay(object = object)
        object[[paste0(DefaultAssay(object = graph), '_monocle3_graph')]] <- graph
      },
      silent = TRUE
    )
    # BUGFIX: was `x = cds`, but no `cds` exists in this scope; the silent
    # try() masked the error so pseudotime was never transferred
    try(
      expr = object[['monocle3_pseudotime']] <- monocle3::pseudotime(
        x = x,
        reduction_method = clusters
      ),
      silent = TRUE
    )
  }
  # TODO: Pull trajectory information
  return(object)
}

#' Run \code{\link[monocle3]{learn_graph}} on a \code{\link[Seurat]{Seurat}} object
#'
#' @param object A \code{\link[Seurat]{Seurat}} object
#' @param reduction Name of reduction to use for learning the pseudotime graph
#' @param ...
Arguments passed to \code{\link[monocle3]{learn_graph}}
#'
#' @return A \code{\link[monocle3]{cell_data_set}} object with the pseudotime graph
#'
#' @importFrom Seurat Reductions
#'
#' @seealso \code{\link[monocle3]{learn_graph}} \code{\link[monocle3]{cell_data_set}}
#'
# @export
#'
LearnGraph <- function(object, reduction = DefaultDimReduc(object = object), ...) {
  CheckPackage(package = 'cole-trapnell-lab/monocle3', repository = 'github')
  # Monocle 3 only learns graphs on a reduction named "UMAP", so temporarily
  # expose the requested reduction under that name
  if (reduction != 'UMAP') {
    # BUGFIX: this branch previously contained only a bare '' (a no-op
    # placeholder); warn the user that an existing UMAP is being shadowed
    if ('UMAP' %in% Reductions(object = object)) {
      warning(
        "Overwriting the existing 'UMAP' reduction with '",
        reduction,
        "' for graph learning",
        call. = FALSE,
        immediate. = TRUE
      )
    }
    reduc <- object[[reduction]]
    suppressWarnings(expr = object[['UMAP']] <- reduc)
  }
  cds <- as.cell_data_set(
    x = object,
    assay = DefaultAssay(object = object[['UMAP']]),
    reductions = 'UMAP',
    default.reduction = 'UMAP'
  )
  cds <- monocle3::learn_graph(cds = cds, ...)
  return(cds)
  # if (reduction != 'UMAP') {
  #   object[['UMAP']] <- NULL
  # }
  # return(object)
}



================================================
FILE: R/pacmap.R
================================================
#' Run PaCMAP (Pairwise Controlled Manifold Approximation)
#'
#' Runs PaCMAP, a method for dimensionality reduction for scRNA-seq
#' data. Constructs three kinds of pairs of points: neighbor pairs (pair_neighbors),
#' mid-near pair (pair_MN), and further pairs (pair_FP) based on positional relationship
#' in the original space, and optimize a low-dimensional embedding accordingly.
#' Described in Wang, Y., Huang, H., Rudin, C., & Shaposhnik, Y. (2021). "Understanding
#' how dimension reduction tools work: an empirical approach to deciphering t-SNE, UMAP,
#' TriMAP, and PaCMAP for data visualization." Journal of Machine Learning Research,
#' 22(201), 1-73.
#' This implementation is based on the work of Hao Zhang, as found in
#' https://github.com/zhanghao-njmu/SCP/. We made modifications to ensure compatibility
#' across multiple platforms, including Windows and macOS.
#'
#' @param object An object. This can be a Seurat object or a matrix-like object.
#'
#' @author Yiyang Sun, Haiyang Huang, Gaurav Rajesh Parikh
#' @references Wang, Y., Huang, H., Rudin, C., & Shaposhnik, Y. (2021). "Understanding
#' how dimension reduction tools work: an empirical approach to deciphering t-SNE, UMAP,
#' TriMAP, and PaCMAP for data visualization." Journal of Machine Learning Research,
#' 22(201), 1-73.
#'
#' @examples
#' pancreas_sub <- Seurat::FindVariableFeatures(pancreas_sub)
#' pancreas_sub <- RunPaCMAP(object = pancreas_sub, features = Seurat::VariableFeatures(pancreas_sub))
#' DimPlot(pancreas_sub, reduction = "pacmap")
#'
#' @rdname RunPaCMAP
#' @export
RunPaCMAP <- function(object, ...) {
  # Manual S3-style dispatch: Seurat objects go to the Seurat method,
  # anything matrix-like falls through to the default method
  if (inherits(object, "Seurat")) {
    RunPaCMAP.Seurat(object, ...)
  } else {
    RunPaCMAP.default(object, ...)
  }
}

#' @rdname RunPaCMAP
#' @method RunPaCMAP Seurat
#' @param object An object. This can be a Seurat object or a matrix-like object.
#' @param reduction A character string specifying the reduction to be used as input. Default is "pca".
#' @param dims An integer vector specifying the dimensions to be used. Default is NULL.
#' @param features A character vector specifying the features to be used. Default is NULL.
#' @param assay A character string specifying the assay to be used. Default is NULL.
#' @param layer A character string specifying the layer name to be used. Default is "data".
#' @param reduction.name A character string specifying the name of the reduction to be stored in the Seurat object. Default is "pacmap".
#' @param reduction.key A character string specifying the prefix for the column names of the PaCMAP embeddings. Default is "PaCMAP_".
#'
#' @importFrom Seurat LogSeuratCommand
#' @export
RunPaCMAP.Seurat <- function(object, reduction = "pca", dims = NULL, features = NULL,
                             assay = NULL, layer = "data", n_components = 2,
                             n.neighbors = NULL, MN_ratio = 0.5, FP_ratio = 2,
                             distance_method = "euclidean", lr = 1, num_iters = 250L,
                             apply_pca = TRUE, init = "random",
                             reduction.name = "pacmap", reduction.key = "PaCMAP_",
                             verbose = TRUE, seed.use = 11L, ...) {
  # Exactly one of `dims` or `features` should be supplied; `features` takes
  # precedence when both are given
  if (is.null(dims) && is.null(features)) {
    # BUGFIX: previously read "Please specify only one of ...", which is
    # misleading when neither argument was provided
    stop("Please specify one of `dims` or `features`.")
  }
  if (!is.null(x = features)) {
    # Build the input matrix (cells x features) from the requested assay layer
    assay <- assay %||% DefaultAssay(object = object)
    data.use <- t(as.matrix(x = GetAssayData(object = object, layer = layer, assay = assay)[features, , drop = FALSE]))
    if (ncol(x = data.use) < n_components) {
      stop(
        "Please provide as many or more features than n_components: ",
        length(x = features),
        " features provided, ",
        n_components,
        " PaCMAP components requested",
        call. = FALSE
      )
    }
  } else if (!is.null(x = dims)) {
    if (!is.null(x = assay) && assay != DefaultAssay(object = object[[reduction]])) {
      warning("If both `assay` and `dims` are specified, the value of `assay` will get ignored.")
    }
    # Use the requested dimensions of an existing reduction as input
    data.use <- Embeddings(object[[reduction]])[, dims]
    assay <- DefaultAssay(object = object[[reduction]])
    if (length(x = dims) < n_components) {
      stop(
        "Please provide as many or more dims than n_components: ",
        length(x = dims),
        " dims provided, ",
        n_components,
        " PaCMAP components requested",
        call. = FALSE
      )
    }
  } else {
    # Unreachable given the check above; kept as a defensive guard
    stop("Please specify one of dims or features")
  }
  # Delegate to the default (matrix) method and store the resulting reduction
  object[[reduction.name]] <- RunPaCMAP(
    object = data.use,
    assay = assay,
    n_components = n_components,
    n.neighbors = n.neighbors,
    MN_ratio = MN_ratio,
    FP_ratio = FP_ratio,
    distance_method = distance_method,
    lr = lr,
    num_iters = num_iters,
    apply_pca = apply_pca,
    init = init,
    reduction.key = reduction.key,
    verbose = verbose,
    seed.use = seed.use
  )
  object <- LogSeuratCommand(object = object)
  return(object)
}

#' @rdname RunPaCMAP
#' @method RunPaCMAP default
#' @importFrom Seurat CreateDimReducObject
#' @importFrom reticulate import
#' @param object An object. This can be a Seurat object or a matrix-like object.
#' @param reduction A character string specifying the reduction to be used as input. Default is "pca".
#' @param dims An integer vector specifying the dimensions to be used. Default is NULL.
#' @param features A character vector specifying the features to be used. Default is NULL.
#' @param assay A character string specifying the assay to be used. Default is NULL.
#' @param layer A character string specifying the layer name to be used. Default is "data".
#' @param n_components An integer specifying the number of PaCMAP components. Default is 2.
#' @param n.neighbors An integer specifying the number of neighbors considered in the k-Nearest Neighbor graph. Default to 10 for dataset whose sample size is smaller than 10000. For large dataset whose sample size (n) is larger than 10000, the default value is: 10 + 15 * (log10(n) - 4).
#' @param MN_ratio A numeric value specifying the ratio of the number of mid-near pairs to the number of neighbors. Default is 0.5.
#' @param FP_ratio A numeric value specifying the ratio of the number of further pairs to the number of neighbors. Default is 2.
#' @param distance_method A character string specifying the distance metric to be used. Default is "euclidean".
#' @param lr A numeric value specifying the learning rate of the AdaGrad optimizer. Default is 1.
#' @param num_iters An integer specifying the number of iterations for PaCMAP optimization. Default is 250.
#' @param apply_pca A logical value indicating whether pacmap should apply PCA to the data before constructing the k-Nearest Neighbor graph. Using PCA to preprocess the data can largely accelerate the DR process without losing too much accuracy. Notice that this option does not affect the initialization of the optimization process. Default is TRUE.
#' @param init A character string specifying the initialization of the lower dimensional embedding. One of "pca" or "random". Default is "random".
#' @param reduction.name A character string specifying the name of the reduction to be stored in the Seurat object. Default is "pacmap".
#' @param reduction.key A character string specifying the prefix for the column names of the PaCMAP embeddings. Default is "PaCMAP_".
#' @param verbose A logical value indicating whether to print verbose output. Default is TRUE.
#' @param seed.use An integer specifying the random seed to be used. Default is 11.
#' @param ... Additional arguments to be passed to the pacmap.PaCMAP function.
#' @export
RunPaCMAP.default <- function(object, assay = NULL, n_components = 2, n.neighbors = NULL,
                              MN_ratio = 0.5, FP_ratio = 2, distance_method = "euclidean",
                              lr = 1, num_iters = 250L, apply_pca = TRUE, init = "random",
                              reduction.key = "PaCMAP_", verbose = TRUE, seed.use = 11L,
                              ...) {
  if (!is.null(x = seed.use)) {
    set.seed(seed = seed.use)
  }
  # BUGFIX: qualify with reticulate:: -- only reticulate::import is brought in
  # via @importFrom, so an unqualified py_module_available() is not in scope
  # when this runs inside the package namespace
  if (!reticulate::py_module_available(module = 'pacmap')) {
    stop("Cannot find PaCMAP, please install through conda (e.g. conda install conda-forge::pacmap).")
  }
  pacmap <- reticulate::import("pacmap")
  operator <- pacmap$PaCMAP(
    n_components = as.integer(n_components),
    n_neighbors = n.neighbors,
    MN_ratio = MN_ratio,
    FP_ratio = FP_ratio,
    distance = distance_method,
    lr = lr,
    # coerce so a plain numeric (e.g. 250) is not handed to Python as a float
    num_iters = as.integer(num_iters),
    apply_pca = apply_pca,
    verbose = verbose,
    # BUGFIX: as.integer(NULL) is integer(0), which pacmap rejects; pass NULL
    # (converted to Python's None) when no seed was requested
    random_state = if (is.null(x = seed.use)) NULL else as.integer(seed.use)
  )
  embedding <- operator$fit_transform(object, init = init)
  colnames(x = embedding) <- paste0(reduction.key, seq_len(ncol(x = embedding)))
  rownames(x = embedding) <- rownames(object)
  reduction <- CreateDimReducObject(
    embeddings = embedding,
    key = reduction.key,
    assay = assay,
    global = TRUE
  )
  return(reduction)
}



================================================
FILE: R/presto.R
================================================
#' @include internal.R
#'
NULL

# Runs Wilcoxon Rank Sum using the Presto package
#
# @param data.use Data matrix to test
# @param cells.1 Group 1 cells
# @param cells.2 Group 2 cells
# @param verbose Print a progress bar
# @param ... Extra parameters passed to wilcox.test
#
# @return Returns a p-value ranked matrix of putative differentially expressed
# features
#
#' @importFrom stats wilcox.test
#
PrestoDETest <- function(
  data.use,
  cells.1,
  cells.2,
  verbose = TRUE,
  ...
) {
  # Restrict the expression matrix to the two groups being compared
  data.use <- data.use[, c(cells.1, cells.2), drop = FALSE]
  # NOTE: do not use logfc from presto
  # Label each cell with its comparison group, in the same order as the columns
  group.info <- factor(
    c(rep(x = "Group1", length = length(x = cells.1)),
      rep(x = "Group2", length = length(x = cells.2))),
    levels = c("Group1", "Group2"))
  names(x = group.info) <- c(cells.1, cells.2)
  data.use <- data.use[, names(x = group.info), drop = FALSE]
  # presto::wilcoxauc returns one row per feature per group; the two halves
  # mirror each other, so keep only the first (Group1) half
  res <- presto::wilcoxauc(X = data.use, y = group.info)
  res <- res[1:(nrow(x = res)/2), c('pval','auc')]
  # Rename 'pval' to the column name Seurat's DE machinery expects
  colnames(x = res)[1] <- 'p_val'
  return(as.data.frame(x = res, row.names = rownames(x = data.use)))
}

#' A Presto-based implementation of FindMarkers that runs Wilcoxon tests for the given identity classes
#'
#' @param ident.1 Identity class to define markers for; pass an object of class
#' \code{phylo} or 'clustertree' to find markers for a node in a cluster tree;
#' passing 'clustertree' requires \code{\link{BuildClusterTree}} to have been run
#' @param ident.2 A second identity class for comparison; if \code{NULL},
#' use all other cells for comparison; if an object of class \code{phylo} or
#' 'clustertree' is passed to \code{ident.1}, must pass a node to find markers for
#' @param reduction Reduction to use in differential expression testing - will test for DE on cell embeddings
#' @param group.by Regroup cells into a different identity class prior to performing differential expression (see example)
#' @param subset.ident Subset a particular identity class prior to regrouping. Only relevant if group.by is set (see example)
#' @param assay Assay to use in differential expression testing
#' @param slot Slot to pull data from; note that if \code{test.use} is "negbinom", "poisson", or "DESeq2",
#' \code{slot} will be set to "counts"
#' @param mean.fxn Function to use for fold change or average difference calculation.
#' If NULL, the appropriate function will be chose according to the slot used
#' @param fc.name Name of the fold change, average difference, or custom function column
#' in the output data.frame.
If NULL, the fold change column will be named
#' according to the logarithm base (eg, "avg_log2FC"), or if using the scale.data
#' slot "avg_diff".
#' @param base The base with respect to which logarithms are computed.
#'
#' @importFrom rlang duplicate
#' @importFrom utils assignInNamespace
#' @importFrom Seurat FindMarkers
#'
#' @export
#' @seealso https://github.com/immunogenomics/presto
RunPresto <- function(
  object,
  ident.1 = NULL,
  ident.2 = NULL,
  group.by = NULL,
  subset.ident = NULL,
  assay = NULL,
  slot = 'data',
  reduction = NULL,
  features = NULL,
  logfc.threshold = 0.25,
  test.use = "wilcox",
  min.pct = 0.1,
  min.diff.pct = -Inf,
  verbose = TRUE,
  only.pos = FALSE,
  max.cells.per.ident = Inf,
  random.seed = 1,
  latent.vars = NULL,
  min.cells.feature = 3,
  min.cells.group = 3,
  mean.fxn = NULL,
  fc.name = NULL,
  base = 2,
  ...
) {
  # Presto only accelerates the Wilcoxon rank-sum test
  if (test.use != 'wilcox') {
    stop("Differential expression test must be `wilcox`")
  }
  CheckPackage(package = 'immunogenomics/presto', repository = 'github')
  # Temporarily monkey-patch Seurat's internal WilcoxDETest with the
  # Presto-backed implementation; keep a copy so it can be restored
  orig.fxn <- rlang::duplicate(x = Seurat:::WilcoxDETest)
  assignInNamespace(
    x = "WilcoxDETest",
    value = PrestoDETest,
    ns = "Seurat")
  # `finally` guarantees the original internal is restored even if
  # FindMarkers errors out
  tryCatch(
    expr = res <- FindMarkers(
      object = object,
      ident.1 = ident.1,
      ident.2 = ident.2,
      group.by = group.by,
      subset.ident = subset.ident,
      assay = assay,
      slot = slot,
      reduction = reduction,
      features = features,
      logfc.threshold = logfc.threshold,
      test.use = "wilcox",
      min.pct = min.pct,
      min.diff.pct = min.diff.pct,
      verbose = verbose,
      only.pos = only.pos,
      max.cells.per.ident = max.cells.per.ident,
      random.seed = random.seed,
      latent.vars = latent.vars,
      min.cells.feature = min.cells.feature,
      min.cells.group = min.cells.group,
      mean.fxn = mean.fxn,
      fc.name = fc.name,
      base = base,
      ...
    ),
    finally = assignInNamespace(
      x = "WilcoxDETest",
      value = orig.fxn,
      ns = "Seurat")
  )
  return(res)
}

#' A Presto-based implementation of FindAllMarkers that runs Wilcoxon tests for all identity classes
#'
#' Finds markers (Wilcoxon-differentially expressed genes) for each of the identity classes in a dataset
#'
#' @inheritParams RunPresto
#' @param node A node to find markers for and all its children; requires
#' \code{\link{BuildClusterTree}} to have been run previously; replaces \code{FindAllMarkersNode}
#' @param return.thresh Only return markers that have a p-value < return.thresh, or a power > return.thresh (if the test is ROC)
#'
#' @return Matrix containing a ranked list of putative markers, and associated
#' statistics (p-values, logFC, etc.)
#'
#' @importFrom stats setNames
#' @importFrom rlang duplicate
#' @importFrom utils assignInNamespace
#' @importFrom Seurat FindAllMarkers
#'
#' @export
#'
#' @aliases RunPrestoAllNode
#' @seealso https://github.com/immunogenomics/presto
RunPrestoAll <- function(
  object,
  assay = NULL,
  features = NULL,
  logfc.threshold = 0.25,
  test.use = 'wilcox',
  slot = 'data',
  min.pct = 0.1,
  min.diff.pct = -Inf,
  node = NULL,
  verbose = TRUE,
  only.pos = FALSE,
  max.cells.per.ident = Inf,
  random.seed = 1,
  latent.vars = NULL,
  min.cells.feature = 3,
  min.cells.group = 3,
  mean.fxn = NULL,
  fc.name = NULL,
  base = 2,
  return.thresh = 1e-2,
  ...
) {
  # Presto only accelerates the Wilcoxon rank-sum test
  if (test.use != 'wilcox') {
    stop("Differential expression test must be `wilcox`")
  }
  CheckPackage(package = 'immunogenomics/presto', repository = 'github')
  # Same monkey-patching pattern as RunPresto: swap in the Presto-backed
  # test, run FindAllMarkers, and restore the original in `finally`
  orig.fxn <- rlang::duplicate(x = Seurat:::WilcoxDETest)
  assignInNamespace(
    x = "WilcoxDETest",
    value = PrestoDETest,
    ns = "Seurat")
  tryCatch(
    expr = res <- FindAllMarkers(
      object = object,
      assay = assay,
      features = features,
      logfc.threshold = logfc.threshold,
      test.use = "wilcox",
      slot = slot,
      min.pct = min.pct,
      min.diff.pct = min.diff.pct,
      node = node,
      verbose = verbose,
      only.pos = only.pos,
      max.cells.per.ident = max.cells.per.ident,
      random.seed = random.seed,
      latent.vars = latent.vars,
      min.cells.feature = min.cells.feature,
      min.cells.group = min.cells.group,
      mean.fxn = mean.fxn,
      fc.name = fc.name,
      base = base,
      return.thresh = return.thresh,
      ...
    ),
    finally = assignInNamespace(
      x = "WilcoxDETest",
      value = orig.fxn,
      ns = "Seurat")
  )
  return(res)
}



================================================
FILE: R/scVI.R
================================================
#' @include internal.R
#'
NULL

#' scVI Integration
#' @param object A \code{StdAssay} or \code{STDAssay} instance containing
#' merged data
#' @param features Features to integrate
#' @param layers Layers to integrate
#' @param conda_env conda environment to run scVI
#' @param new.reduction Name under which to store resulting DimReduc object
#' @param ndims Dimensionality of the latent space
#' @param nlayers Number of hidden layers used for encoder and decoder NNs
#' @param gene_likelihood Distribution to use for modelling expression
#' data: {"zinb", "nb", "poisson"}
#' @param max_epochs Number of passes through the dataset taken while
#' training the model
#' @param ...
Unused - currently just capturing parameters passed in from
#' \code{Seurat::IntegrateLayers} intended for other integration methods
#'
#' @export
#'
#' @note This function requires the
#' \href{https://docs.scvi-tools.org/en/stable/installation.html}{\pkg{scvi-tools}}
#' package to be installed
#'
#' @examples
#' \dontrun{
#' # Preprocessing
#' obj <- SeuratData::LoadData("pbmcsca")
#' obj[["RNA"]] <- split(obj[["RNA"]], f = obj$Method)
#' obj <- NormalizeData(obj)
#' obj <- FindVariableFeatures(obj)
#' obj <- ScaleData(obj)
#' obj <- RunPCA(obj)
#'
#' # After preprocessing, we integrate layers, specifying a conda environment
#' obj <- IntegrateLayers(
#'   object = obj,
#'   method = scVIIntegration,
#'   new.reduction = "integrated.scvi",
#'   conda_env = "../miniconda3/envs/scvi-env",
#'   verbose = FALSE
#' )
#'
#' # Alternatively, we can integrate SCTransformed data
#' obj <- SCTransform(object = obj)
#' obj <- IntegrateLayers(
#'   object = obj, method = scVIIntegration,
#'   orig.reduction = "pca", new.reduction = "integrated.scvi",
#'   assay = "SCT", conda_env = "../miniconda3/envs/scvi-env", verbose = FALSE
#' )
#' }
#'
#' @seealso \href{https://docs.scvi-tools.org/en/stable/tutorials/notebooks/scvi_in_R.html}{scVI}
#'
#' @return A single-element named list \code{DimReduc} elements containing
#' the integrated data
scVIIntegration <- function(
  object,
  features = NULL,
  layers = "counts",
  conda_env = NULL,
  new.reduction = "integrated.dr",
  ndims = 30,
  nlayers = 2,
  gene_likelihood = "nb",
  max_epochs = NULL,
  ...)
{
  # import python methods from specified conda env
  # NOTE(review): conda_env defaults to NULL, which use_condaenv() will reject;
  # callers appear expected to always supply one -- confirm before relying on it
  reticulate::use_condaenv(conda_env, required = TRUE)
  sc <- reticulate::import("scanpy", convert = FALSE)
  scvi <- reticulate::import("scvi", convert = FALSE)
  anndata <- reticulate::import("anndata", convert = FALSE)
  scipy <- reticulate::import("scipy", convert = FALSE)
  # if `max_epochs` is not set
  if (is.null(max_epochs)) {
    # convert `NULL` to python's `None`
    max_epochs <- reticulate::r_to_py(max_epochs)
  } else {
    # otherwise make sure it's an int
    max_epochs <- as.integer(max_epochs)
  }
  # build a meta.data-style data.frame indicating the batch for each cell
  # scVI expects a single counts matrix so we'll join our layers together
  # it also expects the raw counts matrix
  # TODO: avoid hardcoding this - users can rename their layers arbitrarily
  # so there's no guarantee that the usual naming conventions will be followed
  if (inherits(object, what = "SCTAssay")) {
    batches <- .FindSCTBatches(object)
  } else {
    batches <- .FindBatches(object, layers = layers)
    object <- JoinLayers(object = object, layers = "counts")
  }
  # setup an `AnnData` python instance
  # (cells x genes, as scanpy expects -- hence the transpose)
  adata <- sc$AnnData(
    X = scipy$sparse$csr_matrix(
      # TODO: avoid hardcoding per comment above
      Matrix::t(LayerData(object, layer = "counts")[features, ])
    ),
    obs = batches,
    var = object[[]][features, ]
  )
  scvi$model$SCVI$setup_anndata(adata, batch_key = "batch")
  # initialize and train the model
  model <- scvi$model$SCVI(
    adata = adata,
    n_latent = as.integer(x = ndims),
    n_layers = as.integer(x = nlayers),
    gene_likelihood = gene_likelihood
  )
  model$train(max_epochs = max_epochs)
  # extract the latent representation of the merged data
  latent <- model$get_latent_representation()
  latent <- as.matrix(latent)
  # pull the cell identifiers back out of the `AnnData` instance
  # in case anything was sorted under the hood
  rownames(latent) <- reticulate::py_to_r(adata$obs$index$values)
  # prepend the latent space dimensions with `new.reduction` to
  # give the features more readable names
  colnames(latent) <- paste0(new.reduction, "_", 1:ncol(latent))
  # build a `DimReduc` instance
  suppressWarnings(
    latent.dr <- CreateDimReducObject(
      embeddings = latent,
      key = new.reduction
    )
  )
  # to make it easier to add the reduction into a `Seurat` instance
  # we'll wrap it up in a named list
  output.list <- list(latent.dr)
  names(output.list) <- new.reduction
  return(output.list)
}

# Marks this function as an integration method for Seurat::IntegrateLayers
attr(x = scVIIntegration, which = "Seurat.method") <- "integration"

#' Builds a data.frame with batch identifiers to use when integrating
#' \code{object}. For \code{StdAssays}, batches are split by layer.
#'
#' Internal - essentially the same as \code{Seurat:::CreateIntegrationGroups}
#' except that it does not take in a `scale.layer` param.
#'
#' @noRd
#'
#' @param object A \code{StdAssay} instance.
#' @param layers Layers in \code{object} to integrate.
#'
#' @return A dataframe indexed on the cell identifiers from \code{object} -
#' the dataframe contains a single column, "batch", indicating the layer/batch each cell is from
.FindBatches <- function(object, layers) {
  # build a LogMap indicating which layer each cell is from
  layer.masks <- slot(object, name = "cells")[, layers]
  # get a named vector mapping each cell to its respective layer
  layer.map <- labels(
    layer.masks,
    values = Cells(object, layer = layers)
  )
  # wrap the vector up in a data.frame
  batch.df <- as.data.frame(layer.map)
  names(batch.df) <- "batch"
  return(batch.df)
}

#' Builds a data.frame with batch identifiers to use when integrating
#' \code{object}. For \code{SCTAssay}s, batches are split using their
#' model identifiers.
#'
#' Internal - essentially the same as \code{Seurat:::CreateIntegrationGroups}
#' except that it does not take in a `scale.layer` param.
#'
#' @noRd
#'
#' @param object A \code{SCTAssay} or \code{StdAssays} instance.
#' @param layers Layers in \code{object} to integrate.
#'
#' @return A dataframe indexed on the cell identifiers from \code{object} -
#' the dataframe contains a single column, "batch", indicating the layer/batch each cell is from
.FindSCTBatches <- function(object) {
  # Start from an empty data.frame whose rows are the cells of `object`
  model.batches <- SeuratObject::EmptyDF(n = ncol(object))
  row.names(model.batches) <- Cells(object)
  # For each SCT model, tag its cells with that model's name as their batch
  for (model.name in levels(object)) {
    model.cells <- Cells(object, layer = model.name)
    model.batches[model.cells, "batch"] <- model.name
  }
  model.batches
}



================================================
FILE: R/tricycle.R
================================================
#' @include internal.R
#'
NULL

#' Run estimate_cycle_position on a Seurat object
#'
#' This function run estimate_cycle_position function on Seurat object. It uses
#' the tricycle internal reference projection matrix.
#'
#' @param object Seurat object
#' @param assay Assay to use, defaults to the default assay
#' @param slot Slot to use. It should be library size adjusted **log-expression** values.
#' Note that it is convention that we rename "logcounts" to "data" when converting SingleCellExperiment to Seurat object.
#' See also \code{\link[Seurat]{as.Seurat}}. Defaults to "data"
#' @param reduction.name Name of the cell cycle projection returned
#' @param reduction.key Key for the cell cycle projection returned
#' @param gname Alternative rownames of \code{object}. If provided, this will be used to map genes within \code{object} with genes in reference.
#' If not provided, the rownames of \code{object} will be used instead. Default: NULL
#' @param gname.type The type of gene names as in \code{gname} or rownames of \code{object}. It can be either 'ENSEMBL' or 'SYMBOL'. Default: 'ENSEMBL'
#' @param species The type of species in \code{object}. It can be either 'mouse' or 'human'. Default: 'mouse'
#' @param AnnotationDb An AnnotationDb object.
If the user provides rownames in the format of Ensembl IDs and project human data,
#' this object will be used to map Ensembl IDs to gene SYMBOLs. If no AnnotationDb object being given, the function will use \code{\link[org.Hs.eg.db]{org.Hs.eg.db}}.
#' @param center.pc1 The center of PC1 when defining the angle. Default: 0
#' @param center.pc2 The center of PC2 when defining the angle. Default: 0
#'
#' @export
Runtricycle <- function(
  object,
  assay = NULL,
  slot = "data",
  reduction.name = "tricycleEmbedding",
  reduction.key = "tricycleEmbedding_",
  gname = NULL,
  gname.type = c("ENSEMBL", "SYMBOL"),
  species = c("mouse", "human"),
  AnnotationDb = NULL,
  center.pc1 = 0,
  center.pc2 = 0) {
  SeuratWrappers:::CheckPackage(package = 'tricycle', repository = 'bioconductor')
  assay <- assay %||% DefaultAssay(object = object)
  # Pull log-expression values for the projection
  data.m <- GetAssayData(object = object, assay = assay, slot = slot)
  # Project cells into tricycle's 2-D cell-cycle reference space
  # NOTE(review): relies on tricycle internals (:::) -- may break if tricycle
  # renames these helpers between releases
  projection.m <- tricycle:::.project_cycle_space(data.m, ref.m = NULL, gname = gname, gname.type = gname.type, species = species, AnnotationDb = AnnotationDb)
  # Cell-cycle position is the angle of each cell in the (PC1, PC2) plane
  object$tricyclePosition <- tricycle:::.getTheta(projection.m, center.pc1 = center.pc1, center.pc2 = center.pc2)
  # Store the projection itself as a dimensional reduction
  object[[reduction.name]] <- CreateDimReducObject(
    embeddings = projection.m,
    key = reduction.key,
    assay = assay
  )
  return(object)
}



================================================
FILE: R/velocity.R
================================================
#' @include internal.R
#'
NULL

#' @inheritParams Seurat::CreateSeuratObject
#' @param default.assay Name or index of matrix to use as default assay;
#' defaults to name of first matrix in list
#' @param slot Name of slot to store matrix in; choose from 'counts' or 'data'
#'
#' @importFrom methods new
#' @importFrom utils txtProgressBar packageVersion setTxtProgressBar
#' @importFrom Seurat as.Seurat CreateAssayObject Key<- CreateSeuratObject
#'
#' @details
#' The \code{list} method for \code{\link[Seurat]{as.Seurat}} takes a named list
#' of matrices (dense or sparse) and creates a
single \code{Seurat} object where
#' each matrix is its own assay. The names of the list are taken to be the names
#' of the assays. If not present, assays will be named as "Assay#" where "#" is
#' the index number in the list of matrices. Objects will be constructed as follows:
#' \itemize{
#' \item By default, all matrices are assumed to be raw counts and will be stored
#' in the \code{counts} slot. This can be changed to store in the matrix in the
#' \code{data} slot instead. The \code{slot} parameter is vectorized, so different
#' matrices can be stored in either \code{counts} or \code{data}
#' \item For any and all matrices designated as \code{counts}, the \code{min.cells}
#' and \code{min.features} filtering will be applied. These parameters are vectorized,
#' so different filterings can be applied to different matrices
#' \item No extra information (eg. \code{project}) can be provided to
#' \code{\link[Seurat]{CreateSeuratObject}}
#' }
#'
#' @rdname as.Seurat.extras
#' @export
#' @method as.Seurat list
#'
as.Seurat.list <- function(
  x,
  default.assay = 1,
  slot = 'counts',
  min.cells = 0,
  min.features = 0,
  verbose = TRUE,
  ...
) {
  # Every element must be a dense matrix or a sparse dgCMatrix
  if (!all(sapply(X = x, FUN = inherits, what = c('matrix', 'dgCMatrix')))) {
    stop("All values must be either a matrix or dgCMatrix", call. = FALSE)
  }
  # Fill in missing list names as "Assay<i>"
  names(x = x) <- names(x = x) %||% rep_len(x = '', length.out = length(x = x))
  names(x = x)[nchar(x = names(x = x)) == 0] <- paste0('Assay', which(x = nchar(x = names(x = x)) == 0))
  # Allow the default assay to be given by index as well as by name
  if (is.numeric(x = default.assay)) {
    default.assay <- names(x = x)[default.assay]
  }
  if (!default.assay %in% names(x = x)) {
    stop(
      "Cannot find specified default assay '",
      default.assay,
      "' in the list of matrices",
      call. = FALSE
    )
  }
  # Recycle the per-assay options to one entry per matrix, keyed by assay name
  slot <- rep_len(x = slot, length.out = length(x = x))
  min.cells <- rep_len(x = min.cells, length.out = length(x = x))
  min.features <- rep_len(x = min.features, length.out = length(x = x))
  if (!all(slot %in% c('counts', 'data'))) {
    stop("'slot' must be either 'counts' or 'data'")
  }
  names(x = slot) <- names(x = min.cells) <- names(x = min.features) <- names(x = x)
  if (verbose) {
    pb <- txtProgressBar(min = 0, max = length(x = x), style = 3, file = stderr())
  }
  # Build the object around the default assay first; the 'data' path constructs
  # a bare Seurat object by hand because CreateSeuratObject only accepts counts
  if (slot[[default.assay]] == 'data') {
    assays <- list(CreateAssayObject(data = x[[default.assay]]))
    names(x = assays) <- default.assay
    suppressWarnings(expr = Key(object = assays[[default.assay]]) <- tolower(x = default.assay))
    object <- new(
      Class = 'Seurat',
      assays = assays,
      meta.data = data.frame(row.names = colnames(x = assays[[default.assay]])),
      version = packageVersion(pkg = 'Seurat'),
      project.name = 'SeuratProject'
    )
    DefaultAssay(object = object) <- default.assay
  } else {
    object <- CreateSeuratObject(
      counts = x[[default.assay]],
      assay = default.assay,
      min.cells = min.cells[[default.assay]],
      min.features = min.features[[default.assay]]
    )
  }
  if (verbose) {
    setTxtProgressBar(pb = pb, value = 1 + pb$getVal())
  }
  # Attach the remaining matrices as additional assays
  for (i in names(x = x)) {
    if (i == default.assay) {
      next
    }
    if (slot[[i]] == 'data') {
      suppressWarnings(expr = object[[i]] <- CreateAssayObject(data = x[[i]]))
    } else {
      suppressWarnings(
        expr = object[[i]] <- CreateAssayObject(
          counts = x[[i]],
          min.cells = min.cells[[i]],
          min.features = min.features[[i]]
        )
      )
    }
    if (verbose) {
      setTxtProgressBar(pb = pb, value = 1 + pb$getVal())
    }
  }
  if (verbose) {
    close(con = pb)
  }
  return(object)
}

#' Load RNA Velocity data from a loom file
#'
#' This is a wrapper around \code{\link[velocyto.R]{read.loom.matrices}}, but sends
#' messages to \code{stderr} instead of \code{stdout} (or silences messages with
#' \code{verbose = FALSE})
#'
#' @param file Path to loom file
#' @param engine Method to load data, choose from 'hdf5r' or 'h5'
#' @param verbose Display
progress updates
#'
#' @importFrom utils capture.output
#'
#' @export
#'
#' @seealso \code{\link[velocyto.R]{read.loom.matrices}}
#'
ReadVelocity <- function(file, engine = 'hdf5r', verbose = TRUE) {
  CheckPackage(package = 'velocyto-team/velocyto.R', repository = 'github')
  if (verbose) {
    # Redirect stdout to stderr so progress output does not pollute stdout
    sink(file = stderr(), type = 'output')
    on.exit(expr = sink())
    ldat <- velocyto.R::read.loom.matrices(file = file, engine = engine)
  } else {
    # Discard all printed output when running quietly
    invisible(x = capture.output(ldat <- velocyto.R::read.loom.matrices(
      file = file,
      engine = engine
    )))
  }
  return(ldat)
}

#' Run RNA Velocity
#'
#' @param object A \code{Seurat} object
#' @param spliced Name of spliced assay
#' @param unspliced Name of unspliced assay
#' @param ambiguous Optional name of ambiguous assay
#' @param spliced.average,unspliced.average Required minimum average expression count for the spliced and unspliced expression matrices
#' @param reduction Name of reduction to use
#' @param group.by Factor to group cells by
#' @param cells Vector of cells to use; defaults to all cells
#' (see \code{\link[velocyto.R]{gene.relative.velocity.estimates}:steady.state.cells})
#' @param graph Optional name of nearest neighbor graph to use
#' @param ncores Number of cores to use
#' @param verbose Display progress updates
#' @param ... Extra parameters passed to \code{\link[velocyto.R]{gene.relative.velocity.estimates}}
#'
#' @return ...
#'
#' @importFrom stats as.dist
#' @importFrom Seurat FetchData GetAssayData
#'
#' @export
#'
#' @seealso \code{\link[velocyto.R]{gene.relative.velocity.estimates}} \code{\link[Seurat]{Tool}}
#'
RunVelocity <- function(
  object,
  spliced = 'spliced',
  unspliced = 'unspliced',
  ambiguous = NULL,
  spliced.average = 0.2,
  unspliced.average = 0.05,
  reduction = 'pca',
  group.by = 'ident',
  cells = NULL,
  graph = NULL,
  ncores = 1,
  verbose = TRUE,
  ...
) { CheckPackage(package = 'velocyto-team/velocyto.R', repository = 'github') # return(invisible(x = NULL)) # Collect data from Seurat object clusters <- FetchData(object = object, vars = group.by)[, , drop = TRUE] names(x = clusters) <- colnames(x = object) if (!is.factor(x = clusters)) { clusters <- as.factor(x = clusters) } if (verbose) { message("Filtering genes in the spliced matrix") } spliced.matrix <- velocyto.R::filter.genes.by.cluster.expression( emat = GetAssayData(object = object, assay = spliced), clusters = clusters, min.max.cluster.average = spliced.average ) if (verbose) { message("Filtering genes in the unspliced matrix") } unspliced.matrix <- velocyto.R::filter.genes.by.cluster.expression( emat = GetAssayData(object = object, assay = unspliced), clusters = clusters, min.max.cluster.average = unspliced.average ) if (verbose) { message("Calculating embedding distance matrix") } cell.dist <- as.dist( m = 1 - velocyto.R::armaCor( mat = t(x = Embeddings(object = object, reduction = reduction)) ) ) # Set arguments args <- list(...) defaults <- as.list(x = formals(fun = velocyto.R::gene.relative.velocity.estimates)) args <- args[intersect(x = names(x = args), y = names(x = defaults))] defaults.use <- setdiff(x = names(x = defaults), y = names(x = args)) args[defaults.use] <- defaults[defaults.use] args$emat <- spliced.matrix args$nmat <- unspliced.matrix args$smat <- ambiguous %iff% GetAssayData(object = object, assay = ambiguous) args$steady.state.cells <- cells %||% colnames(x = object) args$cell.dist <- cell.dist args$cellKNN <- graph %iff% object[[graph]] args$n.cores <- ncores args$verbose <- verbose # Run velocity sink(file = stderr(), type = 'output') on.exit(expr = sink()) cd <- do.call(what = velocyto.R::gene.relative.velocity.estimates, args = args) Tool(object = object) <- cd return(object) } #' RNA Velocity Plot #' #' @inheritParams Seurat::DimPlot #' @param ... 
Extra parameters passed on to \code{\link[velocyto.R]{show.velocity.on.embedding.cor}} #' #' @return Invisibly returns \code{NULL}; intended to be called for the side effect of displaying the velocity plot (currently unimplemented and stops via \code{.NotYetImplemented}) #' #' @importFrom Seurat Tool Embeddings #' # @export #' #' @keywords internal #' #' @seealso \code{\link[velocyto.R]{show.velocity.on.embedding.cor}} #' VeloPlot <- function( object, reduction = NULL, ... ) { .NotYetImplemented() CheckPackage(package = 'velocyto-team/velocyto.R', repository = 'github') velocity <- Tool(object = object, slot = 'RunVelocity') if (is.null(x = velocity)) { stop("Please run RunVelocity on this Seurat object") } reduction <- reduction %||% { default.reductions <- c("umap", "tsne", "pca") object.reductions <- Filter( f = function(x) { return(inherits(x = object[[x]], what = 'DimReduc')) }, x = names(x = object) ) reduc.use <- min(which(x = default.reductions %in% object.reductions)) default.reductions[reduc.use] } embeddings <- Embeddings(object = object, reduction = reduction) velocyto.R::show.velocity.on.embedding.cor( emb = embeddings, vel = velocity, ... ) return(invisible(x = NULL)) } ================================================ FILE: README.md ================================================ # SeuratWrappers SeuratWrappers is a collection of community-provided methods and extensions for [Seurat](https://satijalab.org/seurat/), curated by the Satija Lab at NYGC. These methods comprise functionality not presently found in Seurat, and are able to be updated much more frequently.
Please see our [contribution guide](https://github.com/satijalab/seurat.wrappers/wiki) for assistance and guidelines in developing and adding new methods to SeuratWrappers Individual method vignettes can be found in the [`docs/`](https://github.com/satijalab/seurat.wrappers/tree/master/docs) directory, we recommend looking at the standard markdown (`*.md`) files when viewing on GitHub Installation can be accomplished through [remotes](https://cran.r-project.org/package=remotes) ```R remotes::install_github('satijalab/seurat-wrappers') ``` ## Method Listing | Package | Vignette | Reference | Source | | ------- | -------- | --------- | ------ | | Monocle 3 | [Calculating Trajectories with Monocle 3 and Seurat](http://htmlpreview.github.io/?https://github.com/satijalab/seurat-wrappers/blob/master/docs/monocle3.html) | Cao et al, Nature 2019 | https://cole-trapnell-lab.github.io/monocle3 | | scVelo | [Estimating RNA Velocity using Seurat and scVelo](http://htmlpreview.github.io/?https://github.com/satijalab/seurat-wrappers/blob/master/docs/scvelo.html) | Bergen et al, bioRxiv 2019 | https://scvelo.readthedocs.io | | CoGAPS | [Running CoGAPS on Seurat Objects](http://htmlpreview.github.io/?https://github.com/satijalab/seurat-wrappers/blob/master/docs/cogaps.html) | Stein-O’Brien et al, Cell Systems 2019 | https://www.bioconductor.org/packages/release/bioc/html/CoGAPS.html | | glmpca | [Running GLM-PCA on a Seurat Object](http://htmlpreview.github.io/?https://github.com/satijalab/seurat-wrappers/blob/master/docs/glmpca.html) | Townes et al, Genome Biology 2019 | https://github.com/willtownes/glmpca | | Conos | [Integration of datasets using Conos](http://htmlpreview.github.io/?https://github.com/satijalab/seurat-wrappers/blob/master/docs/conos.html) | Barkas et al, Nature Methods 2019 | https://github.com/hms-dbmi/conos | | LIGER | [Integrating Seurat objects using 
LIGER](http://htmlpreview.github.io/?https://github.com/satijalab/seurat-wrappers/blob/master/docs/liger.html) | Welch et al, Cell 2019 | https://github.com/MacoskoLab/liger | | fastMNN | [Running fastMNN on Seurat Objects](http://htmlpreview.github.io/?https://github.com/satijalab/seurat-wrappers/blob/master/docs/fast_mnn.html) | Nature Biotechnology 2018 | https://bioconductor.org/packages/release/bioc/html/batchelor.html | | Harmony | [Integration of datasets using Harmony](http://htmlpreview.github.io/?https://github.com/satijalab/seurat-wrappers/blob/master/docs/harmony.html) | Korsunsky et al, bioRxiv 2018 | https://github.com/immunogenomics/harmony | | ALRA | [Zero-preserving imputation with ALRA](http://htmlpreview.github.io/?https://github.com/satijalab/seurat-wrappers/blob/master/docs/alra.html) | Linderman et al, bioRxiv 2018 | https://github.com/KlugerLab/ALRA | | Velocity | [Estimating RNA Velocity using Seurat](http://htmlpreview.github.io/?https://github.com/satijalab/seurat-wrappers/blob/master/docs/velocity.html) | La Manno et al, Nature 2018 | https://velocyto.org | | schex | [Using schex with Seurat](http://htmlpreview.github.io/?https://github.com/satijalab/seurat-wrappers/blob/master/docs/schex.html) | Freytag, R package 2019 | https://github.com/SaskiaFreytag/schex | | alevin | [Import alevin counts into Seurat](http://htmlpreview.github.io/?https://github.com/satijalab/seurat-wrappers/blob/master/docs/alevin.html) | Srivastava et. al., Genome Biology 2019 | https://github.com/k3yavi/alevin-Rtools | | Nebulosa | [Visualization of gene expression with Nebulosa](http://htmlpreview.github.io/?https://github.com/satijalab/seurat-wrappers/blob/master/docs/nebulosa.html) | Jose Alquicira-Hernandez and Joseph E. Powell, _Under Review_ | https://github.com/powellgenomicslab/Nebulosa | | CIPR | [Using CIPR with human PBMC data](http://htmlpreview.github.io/?https://github.com/satijalab/seurat-wrappers/blob/master/docs/cipr.html) | Ekiz et. 
al., BMC Bioinformatics 2020 | https://github.com/atakanekiz/CIPR-Package | | miQC | [Running miQC on Seurat objects](http://htmlpreview.github.io/?https://github.com/satijalab/seurat-wrappers/blob/master/docs/miQC.html) | Hippen et. al., bioRxiv 2021 | https://github.com/greenelab/miQC | | tricycle | [Running estimate_cycle_position from tricycle on Seurat Objects](http://htmlpreview.github.io/?https://github.com/satijalab/seurat-wrappers/blob/master/docs/tricycle.html) | Zheng et. al., bioRxiv 2021 | https://www.bioconductor.org/packages/release/bioc/html/tricycle.html | | PaCMAP | [Running PaCMAP on Seurat Objects](http://htmlpreview.github.io/?https://github.com/satijalab/seurat-wrappers/blob/master/docs/pacmap.html) | Wang et. al, JMLR 2021; Huang et. al, Communications Biology 2022 | https://github.com/YingfanWang/PaCMAP | ================================================ FILE: docs/README.md ================================================ # SeuratWrappers Vignettes This directory contains individual method vignettes for each method provided by SeuratWrappers. Generally speaking, each vignette will have three files. Each file has the same content, just formatted and rendered differently: - `*.Rmd`: these are the source Rmarkdown files used to generate the other files. Viewing them will show only the raw code - `*.html`: these are knitted HTML files. Viewing them on GitHub will show only the raw HTML code. To view rendered HTML, checkout our [vignettes](https://satijalab.org/seurat/vignettes.html) page or use the wonderful [GitHub HTML Preview service](https://htmlpreview.github.io/); links to rendered HTMLs are also provided below - `*.md`: these are knited Markdown files. 
Viewing them will show the rendered Markdown content, useful for browsing here on GitHub ## Method Listing | Package | Vignette | Reference | Source | | ------- | -------- | --------- | ------ | | Conos | [Integration of datasets using Conos](http://htmlpreview.github.io/?https://github.com/satijalab/seurat-wrappers/blob/master/docs/conos.html) | Barkas et al, Nature Methods 2019 | https://github.com/hms-dbmi/conos | | LIGER | [Integrating Seurat objects using LIGER](http://htmlpreview.github.io/?https://github.com/satijalab/seurat-wrappers/blob/master/docs/liger.html) | Welch et al, Cell 2019 | https://github.com/MacoskoLab/liger | | fastMNN | [Running fastMNN on Seurat Objects](http://htmlpreview.github.io/?https://github.com/satijalab/seurat-wrappers/blob/master/docs/fast_mnn.html) | Nature Biotechnology 2018 | https://bioconductor.org/packages/release/bioc/html/scran.html | | Harmony | [Integration of datasets using Harmony](http://htmlpreview.github.io/?https://github.com/satijalab/seurat-wrappers/blob/master/docs/harmony.html) | Korsunsky et al, bioRxiv 2018 | https://github.com/immunogenomics/harmony | | ALRA | [Zero-preserving imputation with ALRA](http://htmlpreview.github.io/?https://github.com/satijalab/seurat-wrappers/blob/master/docs/alra.html) | Linderman et al, bioRxiv 2018 | https://github.com/KlugerLab/ALRA | | Velocity | [Estimating RNA Velocity using Seurat](http://htmlpreview.github.io/?https://github.com/satijalab/seurat-wrappers/blob/master/docs/velocity.html) | La Manno et al, Nature 2018 | https://velocyto.org | ================================================ FILE: docs/alevin.Rmd ================================================ --- title: "Import alevin counts & generate Seurat object" date: 'Compiled: `r format(Sys.Date(), "%B %d, %Y")`' output: github_document: html_preview: true toc: true toc_depth: 3 fig_width: 16 html_document: df_print: kable theme: united fig_height: 5 fig_width: 16 out_height: 4 --- This vignette demonstrates the 
import of alevin quantified counts into Seurat. Commands and parameters are based off of the [alevin tutorial](https://combine-lab.github.io/alevin-tutorial/2018/running-alevin/). If you use alevin in your work, please cite: > *Alevin efficiently estimates accurate gene abundances from dscRNA-seq data* > > Avi Srivastava, Laraib Malik, Tom Smith, Ian Sudbery & Rob Patro > > Genome Biology, 2019. > > doi: [10.1186/s13059-019-1670-y](https://doi.org/10.1186/s13059-019-1670-y) > > GitHub: https://github.com/COMBINE-lab/salmon ```{r setup, include=FALSE} knitr::opts_chunk$set( tidy = TRUE, tidy.opts = list(width.cutoff = 95), message = FALSE, warning = FALSE ) ``` Prerequisites to install: * [SeuratWrappers](https://github.com/satijalab/seurat-wrappers) * [tximport](https://bioconductor.org/packages/tximport) ```{r packages, eval=FALSE} library(SeuratWrappers) library(tximport) ``` ## {.tabset .tabset-pills} ### Import alevin quantified counts ```{r pbmcsca, eval = FALSE, cache=TRUE} pbmc <- ReadAlevin("~/alevin_out/alevin/quants_mat.gz") ``` ================================================ FILE: docs/alevin.html ================================================

Import alevin counts & generate Seurat object

Compiled: May 18, 2020

This vignette demonstrates the import of alevin quantified counts into Seurat. Commands and parameters are based on the alevin tutorial. If you use alevin in your work, please cite:

Alevin efficiently estimates accurate gene abundances from dscRNA-seq data

Avi Srivastava, Laraib Malik, Tom Smith, Ian Sudbery & Rob Patro

Genome Biology, 2019.

doi: 10.1186/s13059-019-1670-y

GitHub: https://github.com/COMBINE-lab/salmon

Prerequisites to install:

Import alevin quantified counts

================================================ FILE: docs/alevin.md ================================================ Import alevin counts & generate Seurat object ================ Compiled: May 18, 2020 This vignette demonstrates the import of alevin quantified counts into Seurat. Commands and parameters are based off of the [alevin tutorial](https://combine-lab.github.io/alevin-tutorial/2018/running-alevin/). If you use alevin in your work, please cite: > *Alevin efficiently estimates accurate gene abundances from dscRNA-seq > data* > > Avi Srivastava, Laraib Malik, Tom Smith, Ian Sudbery & Rob Patro > > Genome Biology, 2019. > > doi: > [10.1186/s13059-019-1670-y](https://doi.org/10.1186/s13059-019-1670-y) > > GitHub: Prerequisites to install: - [SeuratWrappers](https://github.com/satijalab/seurat-wrappers) - [tximport](https://bioconductor.org/packages/tximport) ``` r library(SeuratWrappers) library(tximport) ``` ## ### Import alevin quantified counts ``` r pbmc <- ReadAlevin("~/alevin_out/alevin/quants_mat.gz") ``` ================================================ FILE: docs/alra.Rmd ================================================ --- title: "Zero-preserving imputation with ALRA" date: 'Compiled: `r format(Sys.Date(), "%B %d, %Y")`' output: github_document: html_preview: false toc: false html_document: df_print: kable theme: united --- This vigettte demonstrates how to run ALRA on Seurat objects, which aims to recover missing values in scRNA-seq data through imputation. If you use ALRA, please cite: > *Zero-preserving imputation of scRNA-seq data using low-rank approximation* > > George C. Linderman, Jun Zhao, Yuval Kluger > > biorxiv, 2018. 
> > doi: https://doi.org/10.1101/397588 > > GitHub: https://github.com/KlugerLab/ALRA ```{r setup, include=FALSE} knitr::opts_chunk$set( tidy = TRUE, tidy.opts = list(width.cutoff = 95), message = FALSE, warning = FALSE, fig.height = 20, fig.width = 16 ) ``` Prerequisites to install: * [Seurat](https://satijalab.org/seurat/install) * [SeuratWrappers](https://github.com/satijalab/seurat-wrappers) * [SeuratData](https://github.com/satijalab/seurat-data) ```{r packages} library(Seurat) library(SeuratData) library(SeuratWrappers) library(dplyr) ``` ### scRNA-seq imputation on pbmc3k To learn more about this dataset, type `?pbmc3k` ```{r sctransform, results='hide', cache=TRUE} InstallData("pbmc3k") data("pbmc3k") # Initial processing and visualization pbmc3k <- SCTransform(pbmc3k) %>% RunPCA() %>% RunUMAP(dims = 1:30) # run ALRA, creates alra assay of imputed values pbmc3k <- RunALRA(pbmc3k) ``` ```{r explore, results='hide', cache=TRUE} # visualize original and imputed values pbmc3k <- NormalizeData(pbmc3k, assay = 'RNA') features.plot <- c('CD3D', 'MS4A1', 'CD8A', 'GZMK', 'NCAM1', 'FCGR3A') DefaultAssay(pbmc3k) <- 'RNA' plot1 <- FeaturePlot(pbmc3k, features.plot, ncol = 2) DefaultAssay(pbmc3k) <- 'alra' plot2 <- FeaturePlot(pbmc3k, features.plot, ncol = 2, cols = c('lightgrey','red')) CombinePlots(list(plot1, plot2), ncol = 1) ``` ================================================ FILE: docs/alra.html ================================================ Zero-preserving imputation with ALRA

This vignette demonstrates how to run ALRA on Seurat objects, which aims to recover missing values in scRNA-seq data through imputation. If you use ALRA, please cite:

Zero-preserving imputation of scRNA-seq data using low-rank approximation

George C. Linderman, Jun Zhao, Yuval Kluger

bioRxiv, 2018.

doi: https://doi.org/10.1101/397588

GitHub: https://github.com/KlugerLab/ALRA

Prerequisites to install:

library(Seurat)
library(SeuratData)
library(SeuratWrappers)
library(dplyr)

scRNA-seq imputation on pbmc3k

To learn more about this dataset, type ?pbmc3k

InstallData("pbmc3k")
data("pbmc3k")
# Initial processing and visualization
pbmc3k <- SCTransform(pbmc3k) %>% RunPCA() %>% RunUMAP(dims = 1:30)
# run ALRA, creates alra assay of imputed values
pbmc3k <- RunALRA(pbmc3k)
# visualize original and imputed values
pbmc3k <- NormalizeData(pbmc3k, assay = "RNA")
features.plot <- c("CD3D", "MS4A1", "CD8A", "GZMK", "NCAM1", "FCGR3A")
DefaultAssay(pbmc3k) <- "RNA"
plot1 <- FeaturePlot(pbmc3k, features.plot, ncol = 2)
DefaultAssay(pbmc3k) <- "alra"
plot2 <- FeaturePlot(pbmc3k, features.plot, ncol = 2, cols = c("lightgrey", "red"))
CombinePlots(list(plot1, plot2), ncol = 1)

================================================ FILE: docs/alra.md ================================================ Zero-preserving imputation with ALRA ================ Compiled: August 16, 2019 This vigettte demonstrates how to run ALRA on Seurat objects, which aims to recover missing values in scRNA-seq data through imputation. If you use ALRA, please cite: > *Zero-preserving imputation of scRNA-seq data using low-rank approximation* > > George C. Linderman, Jun Zhao, Yuval Kluger > > biorxiv, 2018. > > doi: > > GitHub: Prerequisites to install: - [Seurat](https://satijalab.org/seurat/install) - [SeuratWrappers](https://github.com/satijalab/seurat-wrappers) - [SeuratData](https://github.com/satijalab/seurat-data) ``` r library(Seurat) library(SeuratData) library(SeuratWrappers) library(dplyr) ``` ### scRNA-seq imputation on pbmc3k To learn more about this dataset, type `?pbmc3k` ``` r InstallData("pbmc3k") data("pbmc3k") # Initial processing and visualization pbmc3k <- SCTransform(pbmc3k) %>% RunPCA() %>% RunUMAP(dims = 1:30) # run ALRA, creates alra assay of imputed values pbmc3k <- RunALRA(pbmc3k) ``` ``` r # visualize original and imputed values pbmc3k <- NormalizeData(pbmc3k, assay = "RNA") features.plot <- c("CD3D", "MS4A1", "CD8A", "GZMK", "NCAM1", "FCGR3A") DefaultAssay(pbmc3k) <- "RNA" plot1 <- FeaturePlot(pbmc3k, features.plot, ncol = 2) DefaultAssay(pbmc3k) <- "alra" plot2 <- FeaturePlot(pbmc3k, features.plot, ncol = 2, cols = c("lightgrey", "red")) CombinePlots(list(plot1, plot2), ncol = 1) ``` ![](alra_files/figure-markdown_github/explore-1.png) ================================================ FILE: docs/banksy.Rmd ================================================ --- title: "Running BANKSY with Seurat" date: 'Compiled: `r format(Sys.Date(), "%B %d, %Y")`' output: github_document: html_preview: true toc: true html_document: df_print: kable theme: simplex --- ```{r setup, include=FALSE} knitr::opts_chunk$set( message = FALSE, warning = FALSE ) ``` 
```{r include=FALSE} t0 <- Sys.time() ``` ## Introduction In this vignette, we describe how to run BANKSY with Seurat objects. If you use BANKSY in your research, please cite > *BANKSY unifies cell typing and tissue domain segmentation for scalable spatial omics data analysis* > > Vipul Singhal, Nigel Chou, Joseph Lee, Yifei Yue, Jinyue Liu, Wan Kee Chock, Li Lin, Yun-Ching Chang, Erica Mei Ling Teo, Jonathan Aow, Hwee Kuan Lee, Kok Hao Chen & Shyam Prabhakar > > Nature Genetics, 2024 > > doi: [10.1038/s41588-024-01664-3](https://doi.org/10.1038/s41588-024-01664-3) > > Website: https://prabhakarlab.github.io/Banksy BANKSY is a method that incorporates neighborhood information for clustering spatial omics data. By doing so, BANKSY is able to - improve cell-type assignment in noisy data - distinguish subtly different cell-types stratified by microenvironment - identify spatial domains sharing the same microenvironment The amount of neighborhood information incorporated is controlled by a parameter `lambda` in [0,1], with higher values giving more weight to the neighbourhood information during clustering. ## Overview The `RunBanksy` function implemented with the *SeuratWrappers* package allows users to run BANKSY with Seurat objects. We describe two options of running `RunBanksy`. The first is within Seurat's spatial framework (see [here](https://satijalab.org/seurat/articles/spatial_vignette.html) and [here](https://satijalab.org/seurat/articles/spatial_vignette_2.html)) and requires a Seurat object and a lambda parameter as mandatory input. The second option works with Seurat objects that do not have spatial information stored within, and therefore requires an additional argument giving the locations of the cell centroids or spots. **Caveat**: `ScaleData` should not be run after a call to `RunBanksy`; `RunBanksy` populates the `scale.data` slot with the scaled BANKSY matrix. 
Calling `ScaleData` after `RunBanksy` performs gene-wise z-scaling, negating the effect of `lambda`. Prerequisites to install: * [Seurat](https://satijalab.org/seurat/install) * [SeuratData](https://github.com/satijalab/seurat-data) * [SeuratWrappers](https://github.com/satijalab/seurat-wrappers) * [Banksy](https://github.com/prabhakarlab/Banksy/) ```{r lib_load} library(Banksy) library(Seurat) library(SeuratData) library(SeuratWrappers) library(ggplot2) library(gridExtra) library(pals) # Kelly palette for visualization mypal <- kelly()[-1] ``` ## Running BANKSY within Seurat's spatial framework We demonstrate how to run BANKSY within Seurat's spatial analysis framework with a mouse hippocampus Slide-seq v2 dataset from the *SeuratData* package. After installing *SeuratData*, the data can be accessed as follows: ```{r ss_install, eval = FALSE} InstallData('ssHippo') ss.hippo <- LoadData("ssHippo") ``` ```{r, include=FALSE} ss.hippo <- LoadData("ssHippo") ``` We perform simple preprocessing by filtering beads with high mito percentage and keeping only beads within the 5th and 98th percentile of total UMI counts. To keep runtime of this vignette short, we downsample the data to 10,000 beads. ```{r ss_load} # Filtering ss.hippo[['percent.mt']] <- PercentageFeatureSet(ss.hippo, pattern = '^MT-') ss.hippo <- subset(ss.hippo, percent.mt < 10 & nCount_Spatial > quantile(ss.hippo$nCount_Spatial, 0.05) & nCount_Spatial < quantile(ss.hippo$nCount_Spatial, 0.98)) # Downsample set.seed(42) ss.hippo <- ss.hippo[,sample(colnames(ss.hippo), 1e4)] ``` Next, normalize the data and find variable genes: ```{r ss_norm} # Normalize ss.hippo <- NormalizeData(ss.hippo) ss.hippo <- FindVariableFeatures(ss.hippo) ss.hippo <- ScaleData(ss.hippo) ``` To run BANKSY, we specify the following: - `lambda`: a numeric value in [0,1]. With low values of lambda, BANKSY operates in cell-typing mode, while high values of lambda find spatial domains. 
- `assay` and `slot`: determines where to pull the expression data from - `features`: specifies features for downstream analysis. This can be `'all'`, `'variable'` or a subset of features. - `k_geom`: the number of neighbors that defines a cell's neighborhood Call `?RunBanksy` for more details on function parameters. ```{r ss_banksy} # Run BANKSY ss.hippo <- RunBanksy(ss.hippo, lambda = 0.2, verbose=TRUE, assay = 'Spatial', slot = 'data', features = 'variable', k_geom = 15) ss.hippo ``` Note that the `RunBanksy` function sets the default assay to `BANKSY` ( determined by the `assay_name` argument) and fills the `scale.data` slot. Users should not call `ScaleData` on the `BANKSY` assay as this negates the effects of `lambda`. The rest of the pipeline is similar to the 'default' Seurat pipline. We scale the data and run dimensionality reduction with PCA and UMAP: ```{r ss_pca} # Run PCA and UMAP ss.hippo <- RunPCA(ss.hippo, assay = 'BANKSY', features = rownames(ss.hippo), npcs = 30) ss.hippo <- RunUMAP(ss.hippo, dims = 1:30) ``` Next, find BANKSY clusters: ```{r ss_cluster} # Clustering ss.hippo <- FindNeighbors(ss.hippo, dims = 1:30) ss.hippo <- FindClusters(ss.hippo, resolution = 0.5) ``` Visualize the UMAP and Spatial plot: ```{r ss_viz, fig.align='center', fig.height=5, fig.width=10} # Viz grid.arrange( DimPlot(ss.hippo, pt.size = 0.25, label = TRUE, label.size = 3, repel = TRUE), SpatialDimPlot(ss.hippo, stroke = NA, label = TRUE, label.size = 3, repel = TRUE, alpha = 0.5, pt.size.factor = 2), ncol = 2 ) ``` Find markers based on the BANKSY clusters and visualize them. Here, we find differentially expressed genes between the CA1 and CA3 regions. 
```{r ss_markers, fig.align='center', fig.height=5, fig.width=10} # Find markers DefaultAssay(ss.hippo) <- 'Spatial' markers <- FindMarkers(ss.hippo, ident.1 = 4, ident.2 = 9, only.pos = F, logfc.threshold = 1, min.pct = 0.5) markers <- markers[markers$p_val_adj < 0.01,] markers genes <- c('ATP2B1', 'CHGB') SpatialFeaturePlot(ss.hippo, features = genes, pt.size.factor = 3, stroke = NA, alpha = 0.5, max.cutoff = 'q95') ``` ## Running BANKSY with locations provided explicitly One can also call `RunBanksy` on a Seurat object created from counts by providing the location of cell centroids or spots explicitly. In this case, the locations must be stored as metadata. Here, we use a mouse hippocampus VeraFISH dataset provided with the *Banksy* package. ```{r hippo_load} data(hippocampus) head(hippocampus$expression[,1:5]) head(hippocampus$locations) ``` Construct the Seurat object by storing the locations of cell centroids as metadata. We keep cells with total count between 5th and 98th percentile: ```{r hippo_create} # Create manually vf.hippo <- CreateSeuratObject(counts = hippocampus$expression, meta.data = hippocampus$locations) vf.hippo <- subset(vf.hippo, nCount_RNA > quantile(vf.hippo$nCount_RNA, 0.05) & nCount_RNA < quantile(vf.hippo$nCount_RNA, 0.98)) ``` Next, we normalize the data by library size and scale the data: ```{r hippo_norm} # Normalize vf.hippo <- NormalizeData(vf.hippo, scale.factor = 100, normalization.method = 'RC') vf.hippo <- ScaleData(vf.hippo) ``` Now, run BANKSY. Here, we provide the column names of the x and y spatial coordinates as stored in the metadata to `dimx` and `dimy` respectively: ```{r hippo_banksy} # Run BANKSY vf.hippo <- RunBanksy(vf.hippo, lambda = 0.2, dimx = 'sdimx', dimy = 'sdimy', assay = 'RNA', slot = 'data', features = 'all', k_geom = 10) ``` Note that the `RunBanksy` function sets the default assay to `BANKSY` ( determined by the `assay_name` argument) and fills the `scale.data` slot. 
Users should not call `ScaleData` on the `BANKSY` assay as this negates the effects of `lambda`. Run PCA on the BANKSY matrix: ```{r hippo_pca} # PCA vf.hippo <- RunPCA(vf.hippo, assay = 'BANKSY', features = rownames(vf.hippo), npcs = 20) ``` Find BANKSY clusters: ```{r hippo_cluster} # Cluster vf.hippo <- FindNeighbors(vf.hippo, dims = 1:20) vf.hippo <- FindClusters(vf.hippo, resolution = 0.5) ``` Visualise BANKSY clusters in spatial dimensions: ```{r hippo_viz, fig.align='center', fig.height=6, fig.width=7} # Viz FeatureScatter(vf.hippo, 'sdimx', 'sdimy', cols = mypal, pt.size = 0.75) FeatureScatter(vf.hippo, 'sdimx', 'sdimy', cols = mypal, pt.size = 0.1) + facet_wrap(~ colors) ``` Find markers and visualise them. Here, we do so for a cluster defined by a thin layer of cells expressing Gfap. We also write a simple function `genePlot` that plots marker genes in spatial dimensions. ```{r hippo_gene, fig.align='center', fig.height=5, fig.width=6} # Find markers DefaultAssay(vf.hippo) <- 'RNA' markers <- FindMarkers(vf.hippo, ident.1 = 6, only.pos = TRUE) genePlot <- function(object, dimx, dimy, gene, assay = 'RNA', slot = 'scale.data', q.low = 0.01, q.high = 0.99, col.low='blue', col.high='red') { val <- GetAssayData(object, assay=assay, slot=slot)[gene,] val.low <- quantile(val, q.low) val.high <- quantile(val, q.high) val[val < val.low] <- val.low val[val > val.high] <- val.high pdf <- data.frame(x=object[[dimx]], y=object[[dimy]], gene=val) colnames(pdf) <- c('sdimx','sdimy', 'gene') ggplot(pdf, aes(x=sdimx,y=sdimy,color=gene)) + geom_point(size = 1) + theme_minimal() + theme(legend.title = element_blank()) + scale_color_gradient2(low = col.low, high = col.high) + ggtitle(gene) } genePlot(vf.hippo, 'sdimx', 'sdimy', 'Gfap') ``` ## Multi-sample analysis This section demonstrates multi-sample analysis.
Such an approach is appropriate when analysing multiple spatial omics datasets with non-contiguous spatial coordinates, and when large batch effects are not present. Here, we use a mouse hippocampus VeraFISH dataset provided with the *Banksy* package. ```{r} data(hippocampus) head(hippocampus$expression[,1:5]) head(hippocampus$locations) ``` For demonstration purposes, we create three separate datasets by splitting the data. ```{r} # Number of groups n_groups = 3 group_names = paste0('group', seq(n_groups)) group_size = 1000 starts = seq(1, by=group_size, length.out=n_groups) ends = starts + group_size - 1 # List of Seurat objects seu_list = lapply(seq(n_groups), function(i) { idx = seq(starts[i], ends[i]) seu = CreateSeuratObject( counts = hippocampus$expression[,idx], meta.data = data.frame(scale(hippocampus$locations[idx,], scale = FALSE)) ) # Set original identity of cell seu$orig.ident = group_names[i] seu }) seu_list ``` Perform normalisation for each dataset. ```{r} seu_list = lapply(seu_list, NormalizeData, scale.factor = 100, normalization.method = 'RC') ``` Merge the datasets. Note that the spatial coordinates overlap. ```{r multi-spatial, fig.align='center', fig.height=6, fig.width=9} # Merge seu = Reduce(merge, seu_list) seu = JoinLayers(seu) # run this for Seurat v5 objects # Plot spatial coordinates colored by group plot(FetchData(seu, c('sdimx', 'sdimy')), col = factor(seu$orig.ident)) ``` Now run BANKSY. For multi-sample analysis, the argument `group` must be provided, which specifies the name of the metadata column that gives the assignment of each cell or spot to its original Seurat object. Here, we use `orig.ident`. Internally, providing the `group` argument tells the function to compute neighborhood matrices based on locations staggered by `group`, ensuring that cells from different spatial datasets do not overlap. The staggered locations are stored in the metadata for sanity checking. 
The `split.scale` argument allows for within-group scaling, accounting for minor differences in datasets. ```{r} # Grouping variable head(seu@meta.data) table(seu$orig.ident) # Run BANKSY seu = RunBanksy(seu, lambda = 0.2, assay = 'RNA', slot = 'data', dimx = 'sdimx', dimy = 'sdimy', features = 'all', group = 'orig.ident', split.scale = TRUE, k_geom = 15) # Staggered locations added to metadata head(seu@meta.data) ``` The rest of the workflow follows as before: ```{r} seu = RunPCA(seu, assay = 'BANKSY', features = rownames(seu), npcs = 30) seu = RunUMAP(seu, dims = 1:30) seu = FindNeighbors(seu, dims = 1:30) seu = FindClusters(seu, resolution = 1) ``` Visualise clusters: ```{r multi-umap, fig.align='center', fig.height=5, fig.width=5} mypal <- kelly()[-1] DimPlot(seu, pt.size = 0.25, label = TRUE, label.size = 3, cols = mypal) ``` ```{r multi-spatial-staggered, fig.align='center', fig.height=3, fig.width=9} FeatureScatter(seu, 'staggered_sdimx', 'staggered_sdimy', pt.size = 0.75, cols = mypal) ``` ## Spatial data integration with Harmony BANKSY can be used with Harmony for integrating multiple spatial omics datasets in the presence of strong batch effects. Download the data. ```{r, eval=FALSE} library(spatialLIBD) library(ExperimentHub) library(harmony) ehub <- ExperimentHub::ExperimentHub() spe <- spatialLIBD::fetch_data(type = "spe", eh = ehub) imgData(spe) <- NULL assay(spe, "logcounts") <- NULL reducedDims(spe) <- NULL rowData(spe) <- NULL colData(spe) <- DataFrame( sample_id = spe$sample_id, clust_annotation = factor( addNA(spe$layer_guess_reordered_short), exclude = NULL, labels = seq(8) ), in_tissue = spe$in_tissue, row.names = colnames(spe) ) invisible(gc()) # Subset to first sample of each subject sample_names <- c("151507", "151669", "151673") spe_list <- lapply(sample_names, function(x) spe[, spe$sample_id == x]) rm(spe) invisible(gc()) ``` Normalise the data and compute highly variable features. 
```{r, eval=FALSE} # Convert to Seurat and Normalize data seu_list <- lapply(spe_list, function(x) { x <- as.Seurat(x, data = NULL) NormalizeData(x, scale.factor = 3000, normalization.method = 'RC') }) # Compute HVGs for each dataset and take the union hvgs <- lapply(seu_list, function(x) { VariableFeatures(FindVariableFeatures(x, nfeatures = 2000)) }) hvgs <- Reduce(union, hvgs) # Subset to HVGs seu_list <- lapply(seu_list, function(x) x[hvgs,]) seu <- Reduce(merge, seu_list) locs <- do.call(rbind.data.frame, lapply(spe_list, spatialCoords)) seu@meta.data <- cbind(seu@meta.data, locs) seu ``` Run BANKSY. When analysing multiple samples, the argument `group` must be provided, which specifies the name of the metadata column that gives the assignment of each cell or spot to its original Seurat object. Here, we use `sample_id`. Internally, providing the `group` argument tells the function to compute neighborhood matrices based on locations staggered by `group`, ensuring that cells from different spatial datasets do not overlap. The staggered locations are stored in the metadata for sanity checking. Within-group scaling has little effect in the presence of strong batch effects, hence, we set `split.scale=FALSE` for efficiency. ```{r, eval=FALSE} # Grouping variable head(seu@meta.data) table(seu$sample_id) sdimx <- 'pxl_col_in_fullres' sdimy <- 'pxl_row_in_fullres' # Run BANKSY seu <- RunBanksy(seu, lambda = 0.2, assay = 'originalexp', slot = 'data', dimx = sdimx, dimy = sdimy, features = 'all', group = 'sample_id', split.scale = FALSE, k_geom = 6) ``` Compute a spatially-aware embedding with PCA on the BANKSY matrix, and run Harmony on this embedding. 
```{r, eval=FALSE} seu <- RunPCA(seu, assay = 'BANKSY', features = rownames(seu), npcs = 10) seu <- RunHarmony(seu, group.by.vars='sample_id') ``` The rest of the workflow follows as before: ```{r, eval=FALSE} seu <- RunUMAP(seu, dims = 1:10, reduction = 'harmony') seu <- FindNeighbors(seu, dims = 1:10, reduction = 'harmony') seu <- FindClusters(seu, resolution = 0.4) ``` Visualise clusters: ```{r, eval=FALSE} DimPlot(seu, pt.size = 0.25, label = TRUE, label.size = 3, cols = mypal) FeatureScatter(seu, 'staggered_sdimx', 'staggered_sdimy', cols = mypal, pt.size = 0.75) ``` ## Getting help For more information, visit https://github.com/prabhakarlab/Banksy.
Vignette runtime ```{r echo=FALSE} Sys.time() - t0 ```
Session info ```{r, sess} sessionInfo() ```
================================================ FILE: docs/banksy.md ================================================ Running BANKSY with Seurat ================ Compiled: April 04, 2024 - [Introduction](#introduction) - [Overview](#overview) - [Running BANKSY within Seurat’s spatial framework](#running-banksy-within-seurats-spatial-framework) - [Running BANKSY with locations provided explicitly](#running-banksy-with-locations-provided-explicitly) - [Multi-sample analysis](#multi-sample-analysis) - [Spatial data integration with Harmony](#spatial-data-integration-with-harmony) - [Getting help](#getting-help) ## Introduction In this vignette, we describe how to run BANKSY with Seurat objects. If you use BANKSY in your research, please cite > *BANKSY unifies cell typing and tissue domain segmentation for > scalable spatial omics data analysis* > > Vipul Singhal, Nigel Chou, Joseph Lee, Yifei Yue, Jinyue Liu, Wan Kee > Chock, Li Lin, Yun-Ching Chang, Erica Mei Ling Teo, Jonathan Aow, Hwee > Kuan Lee, Kok Hao Chen & Shyam Prabhakar > > Nature Genetics, 2024 > > doi: > [10.1038/s41588-024-01664-3](https://doi.org/10.1038/s41588-024-01664-3) > > Website: BANKSY is a method that incorporates neighborhood information for clustering spatial omics data. By doing so, BANKSY is able to - improve cell-type assignment in noisy data - distinguish subtly different cell-types stratified by microenvironment - identify spatial domains sharing the same microenvironment The amount of neighborhood information incorporated is controlled by a parameter `lambda` in \[0,1\], with higher values giving more weight to the neighbourhood information during clustering. ## Overview The `RunBanksy` function implemented with the *SeuratWrappers* package allows users to run BANKSY with Seurat objects. We describe two options of running `RunBanksy`. 
The first is within Seurat’s spatial framework (see [here](https://satijalab.org/seurat/articles/spatial_vignette.html) and [here](https://satijalab.org/seurat/articles/spatial_vignette_2.html)) and requires a Seurat object and a lambda parameter as mandatory input. The second option works with Seurat objects that do not have spatial information stored within, and therefore requires an additional argument giving the locations of the cell centroids or spots. **Caveat**: `ScaleData` should not be run after a call to `RunBanksy`; `RunBanksy` populates the `scale.data` slot with the scaled BANKSY matrix. Calling `ScaleData` after `RunBanksy` performs gene-wise z-scaling, negating the effect of `lambda`. Prerequisites to install: - [Seurat](https://satijalab.org/seurat/install) - [SeuratData](https://github.com/satijalab/seurat-data) - [SeuratWrappers](https://github.com/satijalab/seurat-wrappers) - [Banksy](https://github.com/prabhakarlab/Banksy/) ``` r library(Banksy) library(Seurat) library(SeuratData) library(SeuratWrappers) library(ggplot2) library(gridExtra) library(pals) # Kelly palette for visualization mypal <- kelly()[-1] ``` ## Running BANKSY within Seurat’s spatial framework We demonstrate how to run BANKSY within Seurat’s spatial analysis framework with a mouse hippocampus Slide-seq v2 dataset from the *SeuratData* package. After installing *SeuratData*, the data can be accessed as follows: ``` r InstallData('ssHippo') ss.hippo <- LoadData("ssHippo") ``` We perform simple preprocessing by filtering beads with high mito percentage and keeping only beads within the 5th and 98th percentile of total UMI counts. To keep runtime of this vignette short, we downsample the data to 10,000 beads. 
``` r # Filtering ss.hippo[['percent.mt']] <- PercentageFeatureSet(ss.hippo, pattern = '^MT-') ss.hippo <- subset(ss.hippo, percent.mt < 10 & nCount_Spatial > quantile(ss.hippo$nCount_Spatial, 0.05) & nCount_Spatial < quantile(ss.hippo$nCount_Spatial, 0.98)) # Downsample set.seed(42) ss.hippo <- ss.hippo[,sample(colnames(ss.hippo), 1e4)] ``` Next, normalize the data and find variable genes: ``` r # Normalize ss.hippo <- NormalizeData(ss.hippo) ss.hippo <- FindVariableFeatures(ss.hippo) ss.hippo <- ScaleData(ss.hippo) ``` To run BANKSY, we specify the following: - `lambda`: a numeric value in \[0,1\]. With low values of lambda, BANKSY operates in cell-typing mode, while high values of lambda find spatial domains. - `assay` and `slot`: determines where to pull the expression data from - `features`: specifies features for downstream analysis. This can be `'all'`, `'variable'` or a subset of features. - `k_geom`: the number of neighbors that defines a cell’s neighborhood Call `?RunBanksy` for more details on function parameters. ``` r # Run BANKSY ss.hippo <- RunBanksy(ss.hippo, lambda = 0.2, verbose=TRUE, assay = 'Spatial', slot = 'data', features = 'variable', k_geom = 15) ss.hippo ``` ## An object of class Seurat ## 27264 features across 10000 samples within 2 assays ## Active assay: BANKSY (4000 features, 0 variable features) ## 2 layers present: data, scale.data ## 1 other assay present: Spatial ## 1 image present: image Note that the `RunBanksy` function sets the default assay to `BANKSY` ( determined by the `assay_name` argument) and fills the `scale.data` slot. Users should not call `ScaleData` on the `BANKSY` assay as this negates the effects of `lambda`. The rest of the pipeline is similar to the ‘default’ Seurat pipeline. 
We scale the data and run dimensionality reduction with PCA and UMAP: ``` r # Run PCA and UMAP ss.hippo <- RunPCA(ss.hippo, assay = 'BANKSY', features = rownames(ss.hippo), npcs = 30) ss.hippo <- RunUMAP(ss.hippo, dims = 1:30) ``` Next, find BANKSY clusters: ``` r # Clustering ss.hippo <- FindNeighbors(ss.hippo, dims = 1:30) ss.hippo <- FindClusters(ss.hippo, resolution = 0.5) ``` ## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck ## ## Number of nodes: 10000 ## Number of edges: 365658 ## ## Running Louvain algorithm... ## Maximum modularity in 10 random starts: 0.9033 ## Number of communities: 13 ## Elapsed time: 1 seconds Visualize the UMAP and Spatial plot: ``` r # Viz grid.arrange( DimPlot(ss.hippo, pt.size = 0.25, label = TRUE, label.size = 3, repel = TRUE), SpatialDimPlot(ss.hippo, stroke = NA, label = TRUE, label.size = 3, repel = TRUE, alpha = 0.5, pt.size.factor = 2), ncol = 2 ) ``` Find markers based on the BANKSY clusters and visualize them. Here, we find differentially expressed genes between the CA1 and CA3 regions. 
``` r # Find markers DefaultAssay(ss.hippo) <- 'Spatial' markers <- FindMarkers(ss.hippo, ident.1 = 4, ident.2 = 9, only.pos = F, logfc.threshold = 1, min.pct = 0.5) markers <- markers[markers$p_val_adj < 0.01,] markers ``` ## p_val avg_log2FC pct.1 pct.2 p_val_adj ## SNAP25 1.127235e-46 -1.260312 0.658 0.823 2.622400e-42 ## CHGB 9.840001e-44 -1.985343 0.439 0.697 2.289178e-39 ## STMN2 1.281230e-24 -1.430138 0.335 0.574 2.980653e-20 ## SYN2 3.272800e-23 -1.609355 0.332 0.564 7.613842e-19 ## ATP2B1 1.545647e-22 1.251540 0.639 0.474 3.595793e-18 ## CPLX2 4.619232e-21 -1.220110 0.289 0.522 1.074618e-16 ## PRKCB 1.276453e-18 1.394809 0.552 0.341 2.969539e-14 ## PCP4 2.006224e-18 -1.269671 0.379 0.578 4.667279e-14 ## TUBB2A 1.330787e-16 -1.054176 0.450 0.629 3.095942e-12 ## DDN 1.784378e-14 1.401976 0.592 0.396 4.151176e-10 ## SNCA 7.596526e-12 -1.022314 0.397 0.544 1.767256e-07 ``` r genes <- c('ATP2B1', 'CHGB') SpatialFeaturePlot(ss.hippo, features = genes, pt.size.factor = 3, stroke = NA, alpha = 0.5, max.cutoff = 'q95') ``` ## Running BANKSY with locations provided explicitly One can also call `RunBanksy` on a Seurat object created from counts by providing the location of cell centroids or spots explicitly. In this case, the locations must be stored as metadata. Here, we use a mouse hippocampus VeraFISH dataset provided with the *Banksy* package. ``` r data(hippocampus) head(hippocampus$expression[,1:5]) ``` ## cell_1276 cell_8890 cell_691 cell_396 cell_9818 ## Sparcl1 45 0 11 22 0 ## Slc1a2 17 0 6 5 0 ## Map 10 0 12 16 0 ## Sqstm1 26 0 0 2 0 ## Atp1a2 0 0 4 3 0 ## Tnc 0 0 0 0 0 ``` r head(hippocampus$locations) ``` ## sdimx sdimy ## cell_1276 -13372.899 15776.37 ## cell_8890 8941.101 15866.37 ## cell_691 -14882.899 15896.37 ## cell_396 -15492.899 15835.37 ## cell_9818 11308.101 15846.37 ## cell_11310 14894.101 15810.37 Construct the Seurat object by storing the locations of cell centroids as metadata. 
We keep cells with total count between 5th and 98th percentile: ``` r # Create manually vf.hippo <- CreateSeuratObject(counts = hippocampus$expression, meta.data = hippocampus$locations) vf.hippo <- subset(vf.hippo, nCount_RNA > quantile(vf.hippo$nCount_RNA, 0.05) & nCount_RNA < quantile(vf.hippo$nCount_RNA, 0.98)) ``` Next, we normalize the data by library size and scale the data: ``` r # Normalize vf.hippo <- NormalizeData(vf.hippo, scale.factor = 100, normalization.method = 'RC') vf.hippo <- ScaleData(vf.hippo) ``` Now, run BANKSY. Here, we provide the column names of the x and y spatial coordinates as stored in the metadata to `dimx` and `dimy` respectively: ``` r # Run BANKSY vf.hippo <- RunBanksy(vf.hippo, lambda = 0.2, dimx = 'sdimx', dimy = 'sdimy', assay = 'RNA', slot = 'data', features = 'all', k_geom = 10) ``` Note that the `RunBanksy` function sets the default assay to `BANKSY` ( determined by the `assay_name` argument) and fills the `scale.data` slot. Users should not call `ScaleData` on the `BANKSY` assay as this negates the effects of `lambda`. Run PCA on the BANKSY matrix: ``` r # PCA vf.hippo <- RunPCA(vf.hippo, assay = 'BANKSY', features = rownames(vf.hippo), npcs = 20) ``` Find BANKSY clusters: ``` r # Cluster vf.hippo <- FindNeighbors(vf.hippo, dims = 1:20) vf.hippo <- FindClusters(vf.hippo, resolution = 0.5) ``` ## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck ## ## Number of nodes: 10205 ## Number of edges: 446178 ## ## Running Louvain algorithm... ## Maximum modularity in 10 random starts: 0.9099 ## Number of communities: 15 ## Elapsed time: 1 seconds Visualise BANKSY clusters in spatial dimensions: ``` r # Viz FeatureScatter(vf.hippo, 'sdimx', 'sdimy', cols = mypal, pt.size = 0.75) ``` ``` r FeatureScatter(vf.hippo, 'sdimx', 'sdimy', cols = mypal, pt.size = 0.1) + facet_wrap(~ colors) ``` Find markers and visualise them. Here, we do so for a cluster defined by a thin layer of cells expressing Gfap. 
We also write a simple function `genePlot` that plots marker genes in spatial dimensions. ``` r # Find markers DefaultAssay(vf.hippo) <- 'RNA' markers <- FindMarkers(vf.hippo, ident.1 = 6, only.pos = TRUE) genePlot <- function(object, dimx, dimy, gene, assay = 'RNA', slot = 'scale.data', q.low = 0.01, q.high = 0.99, col.low='blue', col.high='red') { val <- GetAssayData(object, assay=assay, slot=slot)[gene,] val.low <- quantile(val, q.low) val.high <- quantile(val, q.high) val[val < val.low] <- val.low val[val > val.high] <- val.high pdf <- data.frame(x=object[[dimx]], y=object[[dimy]], gene=val) colnames(pdf) <- c('sdimx','sdimy', 'gene') ggplot(pdf, aes(x=sdimx,y=sdimy,color=gene)) + geom_point(size = 1) + theme_minimal() + theme(legend.title = element_blank()) + scale_color_gradient2(low = col.low, high = col.high) + ggtitle(gene) } genePlot(vf.hippo, 'sdimx', 'sdimy', 'Gfap') ``` ## Multi-sample analysis This section demonstrates multi-sample analysis. Such an approach is appropriate when analysing multiple spatial omics datasets with non-contiguous spatial coordinates, and when large batch effects are not present. Here, we use a mouse hippocampus VeraFISH dataset provided with the *Banksy* package. ``` r data(hippocampus) head(hippocampus$expression[,1:5]) ``` ## cell_1276 cell_8890 cell_691 cell_396 cell_9818 ## Sparcl1 45 0 11 22 0 ## Slc1a2 17 0 6 5 0 ## Map 10 0 12 16 0 ## Sqstm1 26 0 0 2 0 ## Atp1a2 0 0 4 3 0 ## Tnc 0 0 0 0 0 ``` r head(hippocampus$locations) ``` ## sdimx sdimy ## cell_1276 -13372.899 15776.37 ## cell_8890 8941.101 15866.37 ## cell_691 -14882.899 15896.37 ## cell_396 -15492.899 15835.37 ## cell_9818 11308.101 15846.37 ## cell_11310 14894.101 15810.37 For demonstration purposes, we create three separate datasets by splitting the data. 
``` r # Number of groups n_groups = 3 group_names = paste0('group', seq(n_groups)) group_size = 1000 starts = seq(1, by=group_size, length.out=n_groups) ends = starts + group_size - 1 # List of Seurat objects seu_list = lapply(seq(n_groups), function(i) { idx = seq(starts[i], ends[i]) seu = CreateSeuratObject( counts = hippocampus$expression[,idx], meta.data = data.frame(scale(hippocampus$locations[idx,], scale = FALSE)) ) # Set original identity of cell seu$orig.ident = group_names[i] seu }) seu_list ``` ## [[1]] ## An object of class Seurat ## 120 features across 1000 samples within 1 assay ## Active assay: RNA (120 features, 0 variable features) ## 1 layer present: counts ## ## [[2]] ## An object of class Seurat ## 120 features across 1000 samples within 1 assay ## Active assay: RNA (120 features, 0 variable features) ## 1 layer present: counts ## ## [[3]] ## An object of class Seurat ## 120 features across 1000 samples within 1 assay ## Active assay: RNA (120 features, 0 variable features) ## 1 layer present: counts Perform normalisation for each dataset. ``` r seu_list = lapply(seu_list, NormalizeData, scale.factor = 100, normalization.method = 'RC') ``` Merge the datasets. Note that the spatial coordinates overlap. ``` r # Merge seu = Reduce(merge, seu_list) seu = JoinLayers(seu) # run this for Seurat v5 objects # Plot spatial coordinates colored by group plot(FetchData(seu, c('sdimx', 'sdimy')), col = factor(seu$orig.ident)) ``` Now run BANKSY. For multi-sample analysis, the argument `group` must be provided, which specifies the name of the metadata column that gives the assignment of each cell or spot to its original Seurat object. Here, we use `orig.ident`. Internally, providing the `group` argument tells the function to compute neighborhood matrices based on locations staggered by `group`, ensuring that cells from different spatial datasets do not overlap. The staggered locations are stored in the metadata for sanity checking. 
The `split.scale` argument allows for within-group scaling, accounting for minor differences in datasets. ``` r # Grouping variable head(seu@meta.data) ``` ## orig.ident nCount_RNA nFeature_RNA sdimx sdimy ## cell_1276 group1 266 51 -11933.19 1366.934 ## cell_8890 group1 13 3 10380.81 1456.934 ## cell_691 group1 132 36 -13443.19 1486.934 ## cell_396 group1 95 27 -14053.19 1425.934 ## cell_9818 group1 10 5 12747.81 1436.934 ## cell_11310 group1 15 5 16333.81 1400.934 ``` r table(seu$orig.ident) ``` ## ## group1 group2 group3 ## 1000 1000 1000 ``` r # Run BANKSY seu = RunBanksy(seu, lambda = 0.2, assay = 'RNA', slot = 'data', dimx = 'sdimx', dimy = 'sdimy', features = 'all', group = 'orig.ident', split.scale = TRUE, k_geom = 15) # Staggered locations added to metadata head(seu@meta.data) ``` ## orig.ident nCount_RNA nFeature_RNA sdimx sdimy ## cell_1276 group1 266 51 -11933.19 1366.934 ## cell_8890 group1 13 3 10380.81 1456.934 ## cell_691 group1 132 36 -13443.19 1486.934 ## cell_396 group1 95 27 -14053.19 1425.934 ## cell_9818 group1 10 5 12747.81 1436.934 ## cell_11310 group1 15 5 16333.81 1400.934 ## staggered_sdimx staggered_sdimy ## cell_1276 3728.686 1366.934 ## cell_8890 26042.686 1456.934 ## cell_691 2218.686 1486.934 ## cell_396 1608.686 1425.934 ## cell_9818 28409.686 1436.934 ## cell_11310 31995.686 1400.934 The rest of the workflow follows as before: ``` r seu = RunPCA(seu, assay = 'BANKSY', features = rownames(seu), npcs = 30) seu = RunUMAP(seu, dims = 1:30) seu = FindNeighbors(seu, dims = 1:30) seu = FindClusters(seu, resolution = 1) ``` ## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck ## ## Number of nodes: 3000 ## Number of edges: 171757 ## ## Running Louvain algorithm... 
## Maximum modularity in 10 random starts: 0.8094 ## Number of communities: 12 ## Elapsed time: 0 seconds Visualise clusters: ``` r mypal <- kelly()[-1] DimPlot(seu, pt.size = 0.25, label = TRUE, label.size = 3, cols = mypal) ``` ``` r FeatureScatter(seu, 'staggered_sdimx', 'staggered_sdimy', pt.size = 0.75, cols = mypal) ``` ## Spatial data integration with Harmony BANKSY can be used with Harmony for integrating multiple spatial omics datasets in the presence of strong batch effects. Download the data. ``` r library(spatialLIBD) library(ExperimentHub) library(harmony) ehub <- ExperimentHub::ExperimentHub() spe <- spatialLIBD::fetch_data(type = "spe", eh = ehub) imgData(spe) <- NULL assay(spe, "logcounts") <- NULL reducedDims(spe) <- NULL rowData(spe) <- NULL colData(spe) <- DataFrame( sample_id = spe$sample_id, clust_annotation = factor( addNA(spe$layer_guess_reordered_short), exclude = NULL, labels = seq(8) ), in_tissue = spe$in_tissue, row.names = colnames(spe) ) invisible(gc()) # Subset to first sample of each subject sample_names <- c("151507", "151669", "151673") spe_list <- lapply(sample_names, function(x) spe[, spe$sample_id == x]) rm(spe) invisible(gc()) ``` Normalise the data and compute highly variable features. ``` r # Convert to Seurat and Normalize data seu_list <- lapply(spe_list, function(x) { x <- as.Seurat(x, data = NULL) NormalizeData(x, scale.factor = 3000, normalization.method = 'RC') }) # Compute HVGs for each dataset and take the union hvgs <- lapply(seu_list, function(x) { VariableFeatures(FindVariableFeatures(x, nfeatures = 2000)) }) hvgs <- Reduce(union, hvgs) # Subset to HVGs seu_list <- lapply(seu_list, function(x) x[hvgs,]) seu <- Reduce(merge, seu_list) locs <- do.call(rbind.data.frame, lapply(spe_list, spatialCoords)) seu@meta.data <- cbind(seu@meta.data, locs) seu ``` Run BANKSY. 
When analysing multiple samples, the argument `group` must be provided, which specifies the name of the metadata column that gives the assignment of each cell or spot to its original Seurat object. Here, we use `sample_id`. Internally, providing the `group` argument tells the function to compute neighborhood matrices based on locations staggered by `group`, ensuring that cells from different spatial datasets do not overlap. The staggered locations are stored in the metadata for sanity checking. Within-group scaling has little effect in the presence of strong batch effects, hence, we set `split.scale=FALSE` for efficiency. ``` r # Grouping variable head(seu@meta.data) table(seu$sample_id) sdimx <- 'pxl_col_in_fullres' sdimy <- 'pxl_row_in_fullres' # Run BANKSY seu <- RunBanksy(seu, lambda = 0.2, assay = 'originalexp', slot = 'data', dimx = sdimx, dimy = sdimy, features = 'all', group = 'sample_id', split.scale = FALSE, k_geom = 6) ``` Compute a spatially-aware embedding with PCA on the BANKSY matrix, and run Harmony on this embedding. ``` r seu <- RunPCA(seu, assay = 'BANKSY', features = rownames(seu), npcs = 10) seu <- RunHarmony(seu, group.by.vars='sample_id') ``` The rest of the workflow follows as before: ``` r seu <- RunUMAP(seu, dims = 1:10, reduction = 'harmony') seu <- FindNeighbors(seu, dims = 1:10, reduction = 'harmony') seu <- FindClusters(seu, resolution = 0.4) ``` Visualise clusters: ``` r DimPlot(seu, pt.size = 0.25, label = TRUE, label.size = 3, cols = mypal) FeatureScatter(seu, 'staggered_sdimx', 'staggered_sdimy', cols = mypal, pt.size = 0.75) ``` ## Getting help For more information, visit .
Vignette runtime ## Time difference of 1.434785 mins
Session info ``` r sessionInfo() ``` ## R version 4.3.2 (2023-10-31) ## Platform: aarch64-apple-darwin20 (64-bit) ## Running under: macOS Sonoma 14.2.1 ## ## Matrix products: default ## BLAS: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib ## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.11.0 ## ## locale: ## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8 ## ## time zone: Europe/London ## tzcode source: internal ## ## attached base packages: ## [1] stats graphics grDevices utils datasets methods base ## ## other attached packages: ## [1] pals_1.8 gridExtra_2.3 ggplot2_3.4.4 ## [4] SeuratWrappers_0.3.4 ssHippo.SeuratData_3.1.4 SeuratData_0.2.2.9001 ## [7] Seurat_5.0.1 SeuratObject_5.0.1 sp_2.1-3 ## [10] Banksy_0.99.9 ## ## loaded via a namespace (and not attached): ## [1] RcppHungarian_0.3 RcppAnnoy_0.0.22 ## [3] splines_4.3.2 later_1.3.2 ## [5] bitops_1.0-7 tibble_3.2.1 ## [7] R.oo_1.26.0 polyclip_1.10-6 ## [9] fastDummies_1.7.3 lifecycle_1.0.4 ## [11] aricode_1.0.3 globals_0.16.2 ## [13] lattice_0.22-5 MASS_7.3-60.0.1 ## [15] magrittr_2.0.3 limma_3.58.1 ## [17] plotly_4.10.4 rmarkdown_2.25 ## [19] yaml_2.3.8 remotes_2.4.2.1 ## [21] httpuv_1.6.14 sctransform_0.4.1 ## [23] spam_2.10-0 spatstat.sparse_3.0-3 ## [25] reticulate_1.35.0 mapproj_1.2.11 ## [27] cowplot_1.1.3 pbapply_1.7-2 ## [29] RColorBrewer_1.1-3 maps_3.4.2 ## [31] abind_1.4-5 zlibbioc_1.48.0 ## [33] Rtsne_0.17 GenomicRanges_1.54.1 ## [35] purrr_1.0.2 R.utils_2.12.3 ## [37] BiocGenerics_0.48.1 RCurl_1.98-1.14 ## [39] rappdirs_0.3.3 GenomeInfoDbData_1.2.11 ## [41] IRanges_2.36.0 S4Vectors_0.40.2 ## [43] ggrepel_0.9.5 irlba_2.3.5.1 ## [45] listenv_0.9.1 spatstat.utils_3.0-4 ## [47] goftest_1.2-3 RSpectra_0.16-1 ## [49] spatstat.random_3.2-2 fitdistrplus_1.1-11 ## [51] parallelly_1.37.0 leiden_0.4.3.1 ## [53] codetools_0.2-19 DelayedArray_0.28.0 ## [55] tidyselect_1.2.0 farver_2.1.1 ## [57] 
matrixStats_1.2.0 stats4_4.3.2 ## [59] spatstat.explore_3.2-6 jsonlite_1.8.8 ## [61] ellipsis_0.3.2 progressr_0.14.0 ## [63] ggridges_0.5.6 survival_3.5-7 ## [65] dbscan_1.1-12 tools_4.3.2 ## [67] ica_1.0-3 Rcpp_1.0.12 ## [69] glue_1.7.0 SparseArray_1.2.4 ## [71] xfun_0.42 MatrixGenerics_1.14.0 ## [73] GenomeInfoDb_1.38.6 dplyr_1.1.4 ## [75] withr_3.0.0 BiocManager_1.30.22 ## [77] fastmap_1.1.1 fansi_1.0.6 ## [79] digest_0.6.34 rsvd_1.0.5 ## [81] R6_2.5.1 mime_0.12 ## [83] colorspace_2.1-0 scattermore_1.2 ## [85] sccore_1.0.4 tensor_1.5 ## [87] dichromat_2.0-0.1 spatstat.data_3.0-4 ## [89] R.methodsS3_1.8.2 utf8_1.2.4 ## [91] tidyr_1.3.1 generics_0.1.3 ## [93] data.table_1.15.0 httr_1.4.7 ## [95] htmlwidgets_1.6.4 S4Arrays_1.2.0 ## [97] uwot_0.1.16 pkgconfig_2.0.3 ## [99] gtable_0.3.4 lmtest_0.9-40 ## [101] SingleCellExperiment_1.24.0 XVector_0.42.0 ## [103] htmltools_0.5.7 dotCall64_1.1-1 ## [105] scales_1.3.0 Biobase_2.62.0 ## [107] png_0.1-8 SpatialExperiment_1.12.0 ## [109] knitr_1.45 rstudioapi_0.15.0 ## [111] reshape2_1.4.4 rjson_0.2.21 ## [113] nlme_3.1-164 zoo_1.8-12 ## [115] stringr_1.5.1 KernSmooth_2.23-22 ## [117] parallel_4.3.2 miniUI_0.1.1.1 ## [119] pillar_1.9.0 grid_4.3.2 ## [121] vctrs_0.6.5 RANN_2.6.1 ## [123] promises_1.2.1 xtable_1.8-4 ## [125] cluster_2.1.6 evaluate_0.23 ## [127] magick_2.8.2 cli_3.6.2 ## [129] compiler_4.3.2 rlang_1.1.3 ## [131] crayon_1.5.2 future.apply_1.11.1 ## [133] labeling_0.4.3 mclust_6.0.1 ## [135] plyr_1.8.9 stringi_1.8.3 ## [137] viridisLite_0.4.2 deldir_2.0-2 ## [139] munsell_0.5.0 lazyeval_0.2.2 ## [141] spatstat.geom_3.2-8 Matrix_1.6-5 ## [143] RcppHNSW_0.6.0 patchwork_1.2.0 ## [145] future_1.33.1 statmod_1.5.0 ## [147] shiny_1.8.0 highr_0.10 ## [149] SummarizedExperiment_1.32.0 ROCR_1.0-11 ## [151] leidenAlg_1.1.2 igraph_2.0.1.1
================================================ FILE: docs/cellbrowser.Rmd ================================================ --- title: "Data Exploration with the UCSC Cell Browser" date: 'Compiled: `r format(Sys.Date(), "%B %d, %Y")`' output: github_document: html_preview: false toc: false html_document: df_print: kable theme: united --- This vignette demonstrates how to launch a UCSC Cell Browser instance populated with data from a Seurat object. If you use cell browser, please cite: > *UCSC Single Cell Browser* > > Maximilian Haeussler, Nikolay Markov, Brian Raney, and Lucas Seninge > Documentation: https://cellbrowser.readthedocs.io ```{r setup, include=FALSE} knitr::opts_chunk$set( tidy = TRUE, tidy.opts = list(width.cutoff = 95), message = FALSE, warning = FALSE, fig.height = 20, fig.width = 16 ) ``` Prerequisites to install: * [Seurat](https://satijalab.org/seurat/install) * [UCSC Cell Browser](https://pypi.org/project/cellbrowser/) * [SeuratWrappers](https://github.com/satijalab/seurat-wrappers) * [SeuratData](https://github.com/satijalab/seurat-data) ```{r packages} library(Seurat) library(SeuratData) library(SeuratWrappers) ``` ### Viewing pbmc3k in a cell browser ```{r cellbrowser, eval=FALSE} InstallData("pbmc3k") pbmc3k <- LoadData("pbmc3k", type = "pbmc3k.final") ExportToCellbrowser(pbmc3k, dir = "out", cb.dir = "cb_out", port = 8080, reductions = "umap") ``` ![](cellbrowser.png) ```{r stopcellbrowser, eval=FALSE} # Remember to stop your cell browser instance when done StopCellbrowser() ``` ================================================ FILE: docs/cellbrowser.html ================================================ Data Exploration with the UCSC Cell Browser

This vignette demonstrates how to launch a UCSC Cell Browser instance populated with data from a Seurat object. If you use cell browser, please cite:

UCSC Single Cell Browser

Maximilian Haeussler, Nikolay Markov, Brian Raney, and Lucas Seninge

Documentation: https://cellbrowser.readthedocs.io

Prerequisites to install:

library(Seurat)
library(SeuratData)
library(SeuratWrappers)

Viewing pbmc3k in a cell browser

InstallData("pbmc3k")
pbmc3k <- LoadData("pbmc3k", type = "pbmc3k.final")
ExportToCellbrowser(pbmc3k, dir = "out", cb.dir = "cb_out", port = 8080, reductions = "umap")

# Remember to stop your cell browser instance when done
StopCellbrowser()
================================================ FILE: docs/cellbrowser.md ================================================ Data Exploration with the UCSC Cell Browser ================ Compiled: August 31, 2020 This vignette demonstrates how to run launch a UCSC Cell Browser instance populated with data from a Seurat object. If you use cell browser, please cite: > *UCSC Single Cell Browser* > > Maximilian Haeussler, Nikolay Markov, Brian Raney, and Lucas Seninge > Documentation: Prerequisites to install: - [Seurat](https://satijalab.org/seurat/install) - [UCSC Cell Browser](https://pypi.org/project/cellbrowser/) - [SeuratWrappers](https://github.com/satijalab/seurat-wrappers) - [SeuratData](https://github.com/satijalab/seurat-data) ``` r library(Seurat) library(SeuratData) library(SeuratWrappers) ``` ### Viewing pbmc3k in a cell browser ``` r InstallData("pbmc3k") pbmc3k <- LoadData("pbmc3k", type = "pbmc3k.final") ExportToCellbrowser(pbmc3k, dir = "out", cb.dir = "cb_out", port = 8080, reductions = "umap") ``` ![](cellbrowser.png) ``` r # Remember to stop your cell browser instance when done StopCellbrowser() ``` ================================================ FILE: docs/cipr.Rmd ================================================ --- title: "Using CIPR with human PBMC data" date: "`r format(Sys.time(), '%d %B, %Y')`" author: "Atakan Ekiz" output: github_document: html_preview: true toc: false html_document: df_print: kable theme: united --- This vignette demonstrates how to run CIPR on Seurat objects. If you use CIPR, please cite: > *CIPR: a web-based R/shiny app and R package to annotate cell clusters in single cell RNA sequencing experiments* > > H. Atakan Ekiz, Christopher J. Conley, W. Zac Stephens & Ryan M. O'Connell > > BMC Bioinformatics, 2020. 
> > doi: [10.1186/s12859-020-3538-2](https://doi.org/10.1186/s12859-020-3538-2) > > Github: https://github.com/atakanekiz/CIPR-Package ```{r, include=FALSE} knitr::opts_chunk$set( tidy = TRUE, tidy.opts = list(width.cutoff = 95), message = FALSE, warning = FALSE ) remotes::install_github("atakanekiz/CIPR-Package") ``` # Summary This vignette describes how to use CIPR package with 3k PBMC data freely available from 10X genomics. Here, we recycle the code described in [Seurat's guided clustering tutorial](https://satijalab.org/seurat/v3.1/pbmc3k_tutorial.html) to help users perform analyses from scratch. Using this dataset we will demonstrate the capabilities of CIPR to annotate single cell clusters in single cell RNAseq (scRNAseq) experiments. For further information about other clustering methods, please see Seurat's comprehensive [website](https://satijalab.org/seurat/) # Install CIPR ```{r, eval=F} if (!requireNamespace("devtools", quietly = TRUE)) install.packages("devtools") # Use this option if you want to build vignettes during installation # This can take a long time due to the installation of suggested packages. remotes::install_github("atakanekiz/CIPR-Package", build_vignettes = TRUE) # Use this if you would like to install the package without vignettes # remotes::install_github("atakanekiz/CIPR-Package") ``` # Seurat pipeline ## Setup Seurat object ```{r, eval=T} library(dplyr) library(Seurat) library(SeuratData) library(CIPR) ``` ```{r} # Load data InstallData("pbmc3k") pbmc <- pbmc3k ``` ## Pre-processing The steps below encompass the standard pre-processing workflow for scRNA-seq data in Seurat. These represent the selection and filtration of cells based on QC metrics, data normalization and scaling, and the detection of highly variable features. 
```{r} # Calculate mitochondrial gene representation (indicative of low quality cells) pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^MT-") # Filter out genes with feature counts outside of 200-2500 range, and >5% mt genes pbmc <- subset(pbmc, subset = nFeature_RNA > 200 & nFeature_RNA < 2500 & percent.mt < 5) ``` ## Normalizing data ```{r, results="hide", message=F} pbmc <- NormalizeData(pbmc) ``` ## Variable gene detection and scaling ```{r, results="hide", message=F} pbmc <- FindVariableFeatures(pbmc, selection.method = "vst", nfeatures = 2000) all.genes <- rownames(pbmc) pbmc <- ScaleData(pbmc, features = all.genes) ``` ## Perform PCA ```{r, results="hide", message=F} pbmc <- RunPCA(pbmc, features = VariableFeatures(object = pbmc)) ``` ```{r, eval=T} ElbowPlot(pbmc) ``` ## Cluster cells ```{r, results="hide", message=F} pbmc <- FindNeighbors(pbmc, dims = 1:10) pbmc <- FindClusters(pbmc, resolution = 0.5) ``` ## Run non-linear dimensionality reduction (tSNE) ```{r, results="hide", message=F} pbmc <- RunTSNE(pbmc, dims = 1:10) pbmc$unnamed_clusters <- Idents(pbmc) ``` ```{r} # saveRDS(pbmc, "pbmc.rds") ``` ## Find differentially expressed genes __This is the step where we generate the input for CIPR's log fold change (logFC) comparison methods.__ ```{r, echo=F, results="hide"} allmarkers <- FindAllMarkers(pbmc) ``` ## Calculate average gene expression per cluster __This is the step where we generate the input for CIPR's all-genes correlation methods.__ ```{r, results="hide"} avgexp <- AverageExpression(pbmc) avgexp <- as.data.frame(x = avgexp$RNA) avgexp$gene <- rownames(avgexp) ``` ## Visualize Seurat object ```{r} DimPlot(pbmc) ``` # CIPR analysis The user can select one of the 7 provided reference data sets: | Reference | `reference` argument | |-------------------------------------------|----------------------| | Immunological Genome Project (ImmGen) | "immgen" | | Presorted cell RNAseq (various tissues) | "mmrnaseq" | | Blueprint/ENCODE | 
"blueprint" | | Human Primary Cell Atlas | "hpca" | | Database of Immune Cell Expression (DICE) | "dice" | | Hematopoietic differentiation | "hema" | | Presorted cell RNAseq (PBMC) | "hsrnaseq" | | User-provided custom reference | "custom" | ## Standard logFC comparison method In this method CIPR accepts `allmarkers` data frame created above and performs the following analytical steps: - It calculates a vector of logFC values for each reference sample (i.e. individual columns of the reference data frame) by comparing log-normalized expression value of a gene (i.e. rows of the reference data frame) to the average gene expression across the entire reference dataset. - It then scores unknown cluster logFC differential gene expression data against this reference logFC values to create a vector of identity scores - User can select one of three methods: - LogFC dot product (sum of all logFC x logFC values among matching genes). This is the recommended method in CIPR. - LogFC Spearman's correlation (rank correlation of logFC values) - LogFC Pearson's correlation (linear correlation of logFC values) ### Plot all identity scores per cluster-reference cell pairs The code below performs analysis using sorted human PBMC RNAseq data as reference, and plots CIPR results can be summarized for each cluster in scatter plots. ```{r, eval=T, fig.width=16, fig.height=32, message=F} CIPR(input_dat = allmarkers, comp_method = "logfc_dot_product", reference = "hsrnaseq", plot_ind = T, plot_top = F, global_results_obj = T, global_plot_obj = T, # axis.text.x=element_text(color="red") # arguments to pass to ggplot2::theme() to change plotting parameters ) ``` ### Plot identity scores for a select cluster `ind_clu_plots` object is created in the global environment to help users can visualize results for a desired cluster and manipulate graphing parameters. ggplot2 functions can be iteratively added to individual plots to create annotations etc. 
```{r, eval=T, fig.width=16, fig.height=5, message=F} library(ggplot2) ind_clu_plots$cluster6 + theme(axis.text.y = element_text(color="red"), axis.text.x = element_text(color="blue")) + labs(fill="Reference")+ ggtitle("Figure S4a. Automated cluster annotation results are shown for cluster 6") + annotate("text", label="2 sd range", x=10, y= 700, size=8, color = "steelblue")+ annotate("text", label= "1 sd range", x=10, y=200, size=8, color ="orange2")+ geom_rect(aes(xmin=94, xmax=99, ymin=1000, ymax=1300), fill=NA, size=3, color="red") ``` ### Plot top scoring reference subsets for each cluster ```{r, eval=T, fig.width=8, fig.height=4.5, message=F} CIPR(input_dat = allmarkers, comp_method = "logfc_dot_product", reference = "hsrnaseq", plot_ind = F, plot_top = T, global_results_obj = T, global_plot_obj = T) ``` ### Tabulate CIPR results CIPR results (both top 5 scoring reference types per cluster and the entire analysis) are saved as global objects (`CIPR_top_results` and `CIPR_all_results` respectively) to allow users to explore the outputs and generate specific plots and tables. ```{r} head(CIPR_top_results) head(CIPR_all_results) ``` ## Standard all-genes correlation method CIPR also implements a simple correlation approach in which overall correlation in gene expression is calculated for the pairs of unknown clusters and the reference samples (regardless of the differential expression status of the gene). This approach is conceptually similar to some other automated identity prediction pipelines such as [SingleR](https://www.ncbi.nlm.nih.gov/pubmed/30643263) and [scMCA](https://www.ncbi.nlm.nih.gov/pubmed/30758821). - __Spearman's correlation:__ It calculates correlation based on ranked gene expression. It can be suitable for comparing experimental and reference data which were obtained using different technologies. - __Pearson's correlation:__ It calculates linear correlations. 
This can be useful when the user would like to provide a custom reference dataset to CIPR. ### Plot all identity scores per cluster-reference cell pairs The code below performs analysis using sorted human PBMC RNAseq data as reference, and CIPR results are summarized for each cluster in scatter plots. ```{r, eval=T, fig.width=16, fig.height=32, message=F} CIPR(input_dat = avgexp, comp_method = "all_genes_spearman", reference = "hsrnaseq", plot_ind = T, plot_top = F, global_results_obj = T, global_plot_obj = T) ``` ### Plot top scoring reference subsets for each cluster ```{r, eval=T, fig.width=8, fig.height=4.5, message=F} CIPR(input_dat = avgexp, comp_method = "all_genes_spearman", reference = "hsrnaseq", plot_ind = F, plot_top = T, global_results_obj = T, global_plot_obj = T) ``` ### Tabulate CIPR results CIPR results (both top 5 scoring reference types per cluster and the entire analysis) are saved as global objects (`CIPR_top_results` and `CIPR_all_results` respectively) to allow users to explore the outputs and generate specific plots and tables. ```{r} head(CIPR_top_results) head(CIPR_all_results) ``` ## Limiting analysis to the select subsets of reference data Sometimes excluding irrelevant reference cell types from the analysis can be helpful. Especially when the logFC comparison methods are utilized, removing irrelevant subsets may improve discrimination of closely related subsets, since the reference logFC values will be calculated after subsetting the data frame. Filtering out reference subsets should not impact results of the all-genes correlation methods, but it can make the graphical outputs easier to look at. The 3k PBMC dataset may not be the best example to demonstrate benefits of reference dataset subsetting, but the code below serves as an example for this functionality. 
```{r, eval=T, fig.width=16, fig.height=32, message=F} CIPR(input_dat = allmarkers, comp_method = "logfc_dot_product", reference = "hsrnaseq", plot_ind = T, plot_top = F, global_results_obj = T, global_plot_obj = T, select_ref_subsets = c("CD4+ T cell", "CD8+ T cell", "Monocyte", "NK cell")) ``` ## Filtering out lowly variable genes Genes that have a low expression variance across the reference data frame have weaker discriminatory potential. Thus, excluding these genes from the analysis can reduce the noise and improve the prediction scores, especially when using all-genes correlation based methods. We implemented a variance filtering parameter, `keep_top_var`, which allows users to keep top Nth% variable reference genes in the analysis. For instance, by setting this argument to 10, CIPR can be instructed to use only the top 10% highly variable genes in identity score calculations. In our experience _(Ekiz HA, BMC Bioinformatics, in revision)_ limiting the analysis to highly variable genes does not significantly impact the identity scores of the top-scoring reference cell subsets, but it reduces the identity scores of intermediate/low-scoring reference cells leading to an improvement of z-scores. The "best" value for this parameter remains to be determined by the user in individual studies. ```{r, eval=T, fig.width=16, fig.height=32, message=F} CIPR(input_dat = avgexp, comp_method = "all_genes_spearman", reference = "hsrnaseq", plot_ind = T, plot_top = F, global_results_obj = T, global_plot_obj = T, keep_top_var = 10) ```
**Session Info** ```{r} sessionInfo() ```
================================================ FILE: docs/cipr.html ================================================ [File too large to display: 10.5 MB] ================================================ FILE: docs/cipr.md ================================================ Using CIPR with human PBMC data ================ Atakan Ekiz 19 May, 2021 This vignette demonstrates how to run CIPR on Seurat objects. If you use CIPR, please cite: > *CIPR: a web-based R/shiny app and R package to annotate cell clusters > in single cell RNA sequencing experiments* > > H. Atakan Ekiz, Christopher J. Conley, W. Zac Stephens & Ryan M. > O’Connell > > BMC Bioinformatics, 2020. > > doi: > [10.1186/s12859-020-3538-2](https://doi.org/10.1186/s12859-020-3538-2) > > Github: # Summary This vignette describes how to use CIPR package with 3k PBMC data freely available from 10X genomics. Here, we recycle the code described in [Seurat’s guided clustering tutorial](https://satijalab.org/seurat/v3.1/pbmc3k_tutorial.html) to help users perform analyses from scratch. Using this dataset we will demonstrate the capabilities of CIPR to annotate single cell clusters in single cell RNAseq (scRNAseq) experiments. For further information about other clustering methods, please see Seurat’s comprehensive [website](https://satijalab.org/seurat/) # Install CIPR ``` r if (!requireNamespace("devtools", quietly = TRUE)) install.packages("devtools") # Use this option if you want to build vignettes during installation This can take a long time # due to the installation of suggested packages. 
remotes::install_github("atakanekiz/CIPR-Package", build_vignettes = TRUE) # Use this if you would like to install the package without vignettes # remotes::install_github('atakanekiz/CIPR-Package') ``` # Seurat pipeline ## Setup Seurat object ``` r library(dplyr) library(Seurat) library(SeuratData) library(CIPR) ``` ``` r # Load data InstallData("pbmc3k") pbmc <- pbmc3k ``` ## Pre-processing The steps below encompass the standard pre-processing workflow for scRNA-seq data in Seurat. These represent the selection and filtration of cells based on QC metrics, data normalization and scaling, and the detection of highly variable features. ``` r # Calculate mitochondrial gene representation (indicative of low quality cells) pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^MT-") # Filter out genes with feature counts outside of 200-2500 range, and >5% mt genes pbmc <- subset(pbmc, subset = nFeature_RNA > 200 & nFeature_RNA < 2500 & percent.mt < 5) ``` ## Normalizing data ``` r pbmc <- NormalizeData(pbmc) ``` ## Variable gene detection and scaling ``` r pbmc <- FindVariableFeatures(pbmc, selection.method = "vst", nfeatures = 2000) all.genes <- rownames(pbmc) pbmc <- ScaleData(pbmc, features = all.genes) ``` ## Perform PCA ``` r pbmc <- RunPCA(pbmc, features = VariableFeatures(object = pbmc)) ``` ``` r ElbowPlot(pbmc) ``` ![](/__w/seurat-wrappers/seurat-wrappers/test-build/cipr_files/figure-gfm/unnamed-chunk-9-1.png) ## Cluster cells ``` r pbmc <- FindNeighbors(pbmc, dims = 1:10) pbmc <- FindClusters(pbmc, resolution = 0.5) ``` ## Run non-linear dimensionality reduction (tSNE) ``` r pbmc <- RunTSNE(pbmc, dims = 1:10) pbmc$unnamed_clusters <- Idents(pbmc) ``` ``` r # saveRDS(pbmc, 'pbmc.rds') ``` ## Find differentially expressed genes **This is the step where we generate the input for CIPR’s log fold change (logFC) comparison methods.** ## Calculate average gene expression per cluster **This is the step where we generate the input for CIPR’s all-genes 
correlation methods.** ``` r avgexp <- AverageExpression(pbmc) avgexp <- as.data.frame(x = avgexp$RNA) avgexp$gene <- rownames(avgexp) ``` ## Visualize Seurat pbject ``` r DimPlot(pbmc) ``` ![](/__w/seurat-wrappers/seurat-wrappers/test-build/cipr_files/figure-gfm/unnamed-chunk-15-1.png) # CIPR analysis The user can select one of the 7 provided reference data sets: | Reference | `reference` argument | | ----------------------------------------- | -------------------- | | Immunological Genome Project (ImmGen) | “immgen” | | Presorted cell RNAseq (various tissues) | “mmrnaseq” | | Blueprint/ENCODE | “blueprint” | | Human Primary Cell Atlas | “hpca” | | Database of Immune Cell Expression (DICE) | “dice” | | Hematopoietic differentiation | “hema” | | Presorted cell RNAseq (PBMC) | “hsrnaseq” | | User-provided custom reference | “custom” | ## Standard logFC comparison method In this method CIPR accepts `allmarkers` data frame created above and performs the following analytical steps: - It calculates a vector of logFC values for each reference sample (i.e. individual columns of the reference data frame) by comparing log-normalized expression value of a gene (i.e. rows of the reference data frame) to the average gene expression across the entire reference dataset. - It then scores unknown cluster logFC differential gene expression data against this reference logFC values to create a vector of identity scores - User can select one of three methods: - LogFC dot product (sum of all logFC x logFC values among matching genes). This is the recommended method in CIPR. - LogFC Spearman’s correlation (rank correlation of logFC values) - LogFC Pearson’s correlation (linear correlation of logFC values) ### Plot all identity scores per cluster-reference cell pairs The code below performs analysis using sorted human PBMC RNAseq data as reference, and plots CIPR results can be summarized for each cluster in scatter plots. 
``` r CIPR(input_dat = allmarkers, comp_method = "logfc_dot_product", reference = "hsrnaseq", plot_ind = T, plot_top = F, global_results_obj = T, global_plot_obj = T, # axis.text.x=element_text(color="red") # arguments to pass to ggplot2::theme() to change plotting parameters ) ``` ![](/__w/seurat-wrappers/seurat-wrappers/test-build/cipr_files/figure-gfm/unnamed-chunk-16-1.png) ### Plot identity scores for a select cluster `ind_clu_plots` object is created in the global environment to help users can visualize results for a desired cluster and manipulate graphing parameters. ggplot2 functions can be iteratively added to individual plots to create annotations etc. ``` r library(ggplot2) ind_clu_plots$cluster6 + theme(axis.text.y = element_text(color = "red"), axis.text.x = element_text(color = "blue")) + labs(fill = "Reference") + ggtitle("Figure S4a. Automated cluster annotation results are shown for cluster 6") + annotate("text", label = "2 sd range", x = 10, y = 700, size = 8, color = "steelblue") + annotate("text", label = "1 sd range", x = 10, y = 200, size = 8, color = "orange2") + geom_rect(aes(xmin = 94, xmax = 99, ymin = 1000, ymax = 1300), fill = NA, size = 3, color = "red") ``` ![](/__w/seurat-wrappers/seurat-wrappers/test-build/cipr_files/figure-gfm/unnamed-chunk-17-1.png) ### Plot top scoring refernce subsets for each cluster ``` r CIPR(input_dat = allmarkers, comp_method = "logfc_dot_product", reference = "hsrnaseq", plot_ind = F, plot_top = T, global_results_obj = T, global_plot_obj = T) ``` ![](/__w/seurat-wrappers/seurat-wrappers/test-build/cipr_files/figure-gfm/unnamed-chunk-18-1.png) ### Tabulate CIPR results CIPR results (both top 5 scoring reference types per cluster and the entire analysis) are saved as global objects (`CIPR_top_results` and `CIPR_all_results` respectively) to allow users to explore the outputs and generate specific plots and tables. 
``` r head(CIPR_top_results) ``` ## # A tibble: 6 x 9 ## # Groups: cluster [2] ## cluster reference_cell_t… reference_id long_name description identity_score ## ## 1 0 CD8+ T cell G4YW_CD8_nai… Naive CD8 … N/A 838. ## 2 0 CD8+ T cell DZQV_CD8_nai… Naive CD8 … N/A 833. ## 3 0 CD8+ T cell 925L_CD8_nai… Naive CD8 … N/A 779. ## 4 0 CD8+ T cell 9JD4_CD8_nai… Naive CD8 … N/A 751. ## 5 0 CD4+ T cell 9JD4_CD4_nai… Naive CD4 … N/A 743. ## 6 1 Monocyte G4YW_C_mono Classical … N/A 2031. ## # … with 3 more variables: index , z_score , ## # percent_pos_correlation ``` r head(CIPR_all_results) ``` ## reference_id identity_score reference_cell_type ## 1 DZQV_B_naive -506.4224 B cell ## 2 DZQV_B_NSM -414.3927 B cell ## 3 DZQV_B_Ex -438.5500 B cell ## 4 DZQV_B_SM -441.4376 B cell ## 5 DZQV_Plasmablasts 226.2113 B cell ## 6 925L_B_naive -128.9296 B cell ## long_name description cluster z_score ## 1 Naive B cells N/A 0 -1.0021725 ## 2 Non-switched memory B cells N/A 0 -0.8200527 ## 3 Exhausted B cells N/A 0 -0.8678580 ## 4 Switched memory B cells N/A 0 -0.8735724 ## 5 Plasmablasts N/A 0 0.4476555 ## 6 Naive B cells N/A 0 -0.2551421 ## percent_pos_correlation ## 1 42.16336 ## 2 43.37748 ## 3 43.92936 ## 4 41.39073 ## 5 64.56954 ## 6 61.92053 ## Standard all-genes correlation method CIPR also implements a simple correlation approach in which overall correlation in gene expression is calculated for the pairs of unknown clusters and the reference samples (regardless of the differential expression status of the gene). This approach is conceptually similar to some other automated identity prediction pipelines such as [SingleR](https://www.ncbi.nlm.nih.gov/pubmed/30643263) and [scMCA](https://www.ncbi.nlm.nih.gov/pubmed/30758821). - **Spearman’s correlation:** It calculates correlation based on ranked gene expression. It can be suitable for comparing experimental and reference data which were obtained using different technologies. 
- **Pearson’s correlation:** It calculates linear correlations. This can be useful when the user would like to provide a custom reference dataset to CIPR. ### Plot all identity scores per cluster-reference cell pairs The code below performs analysis using sorted human PBMC RNAseq data as reference, and plots CIPR results can be summarized for each cluster in scatter plots. ``` r CIPR(input_dat = avgexp, comp_method = "all_genes_spearman", reference = "hsrnaseq", plot_ind = T, plot_top = F, global_results_obj = T, global_plot_obj = T) ``` ![](/__w/seurat-wrappers/seurat-wrappers/test-build/cipr_files/figure-gfm/unnamed-chunk-20-1.png) ### Plot top scoring refernce subsets for each cluster ``` r CIPR(input_dat = avgexp, comp_method = "all_genes_spearman", reference = "hsrnaseq", plot_ind = F, plot_top = T, global_results_obj = T, global_plot_obj = T) ``` ![](/__w/seurat-wrappers/seurat-wrappers/test-build/cipr_files/figure-gfm/unnamed-chunk-21-1.png) ### Tabulate CIPR results CIPR results (both top 5 scoring reference types per cluster and the entire analysis) are saved as global objects (`CIPR_top_results` and `CIPR_all_results` respectively) to allow users to explore the outputs and generate specific plots and tables. 
``` r head(CIPR_top_results) ``` ## # A tibble: 6 x 8 ## # Groups: cluster [2] ## cluster reference_cell_t… reference_id long_name description identity_score ## ## 1 0 CD4+ T cell DZQV_CD4_na… Naive CD4 T… N/A 0.797 ## 2 0 CD4+ T cell 925L_TFH Follicular … N/A 0.793 ## 3 0 CD4+ T cell G4YW_Th1 Th1 cells N/A 0.788 ## 4 0 CD4+ T cell G4YW_Treg T regulator… N/A 0.786 ## 5 0 CD4+ T cell DZQV_Th17 Th17 cells N/A 0.780 ## 6 1 Monocyte G4YW_I_mono Intermediat… N/A 0.784 ## # … with 2 more variables: index , z_score ``` r head(CIPR_all_results) ``` ## reference_id identity_score reference_cell_type ## 1 DZQV_B_naive 0.6503197 B cell ## 2 DZQV_B_NSM 0.6480390 B cell ## 3 DZQV_B_Ex 0.6488979 B cell ## 4 DZQV_B_SM 0.6961983 B cell ## 5 DZQV_Plasmablasts 0.6816080 B cell ## 6 925L_B_naive 0.6421836 B cell ## long_name description cluster z_score ## 1 Naive B cells N/A 0 -0.636484838 ## 2 Non-switched memory B cells N/A 0 -0.668063024 ## 3 Exhausted B cells N/A 0 -0.656170534 ## 4 Switched memory B cells N/A 0 -0.001236968 ## 5 Plasmablasts N/A 0 -0.203258085 ## 6 Naive B cells N/A 0 -0.749138568 ## Limiting analysis to the select subsets of reference data Sometimes excluding irrelevant reference cell types from the analysis can be helpful. Especially when the logFC comparison methods are utilized, removing irrelevant subsets may improve discrimination of closely related subsets, since the reference logFC values will be calculated after subsetting the data frame. Filtering out reference subsets should not impact results of the all-genes correlation methods, but it can make the graphical outputs easier to look at 3k PBMC dataset may not be the best example to demonstrate benefits of reference dataset subsetting, but the code below serves as an example for this functionality. 
``` r CIPR(input_dat = allmarkers, comp_method = "logfc_dot_product", reference = "hsrnaseq", plot_ind = T, plot_top = F, global_results_obj = T, global_plot_obj = T, select_ref_subsets = c("CD4+ T cell", "CD8+ T cell", "Monocyte", "NK cell")) ``` ![](/__w/seurat-wrappers/seurat-wrappers/test-build/cipr_files/figure-gfm/unnamed-chunk-23-1.png) ## Filtering out lowly variable genes Genes that have a low expression variance across the reference data frame has weaker discriminatory potential. Thus, excluding these genes from the analysis can reduce the noise and improve the prediction scores, especially when using all-genes correlation based methods. We implemented a variance filtering parameter, `keep_top_var`, which allows users to keep top Nth% variable reference genes in the analysis. For instance, by setting this argument to 10, CIPR can be instructed to use only the top 10% highly variable genes in identity score calculations. In our experience *(Ekiz HA, BMC Bioinformatics, in revision)* limiting the analysis to highly variable genes does not significantly impact the identity scores of the top-scoring reference cell subsets, but it reduces the identity scores of intermediate/low-scoring reference cells leading to an improvement of z-scores. The “best” value for this parameter remains to be determined by the user in individual studies. ``` r CIPR(input_dat = avgexp, comp_method = "all_genes_spearman", reference = "hsrnaseq", plot_ind = T, plot_top = F, global_results_obj = T, global_plot_obj = T, keep_top_var = 10) ``` ![](/__w/seurat-wrappers/seurat-wrappers/test-build/cipr_files/figure-gfm/unnamed-chunk-24-1.png)
**Session Info** ``` r sessionInfo() ``` ## R version 4.0.4 (2021-02-15) ## Platform: x86_64-pc-linux-gnu (64-bit) ## Running under: Ubuntu 20.04 LTS ## ## Matrix products: default ## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.8.so ## ## locale: ## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C ## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8 ## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=C ## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C ## [9] LC_ADDRESS=C LC_TELEPHONE=C ## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C ## ## attached base packages: ## [1] stats graphics grDevices utils datasets methods base ## ## other attached packages: ## [1] ggplot2_3.3.3 pbmc3k.SeuratData_3.1.4 CIPR_0.1.0 ## [4] SeuratData_0.2.1 SeuratObject_4.0.1 Seurat_4.0.1 ## [7] dplyr_1.0.6 ## ## loaded via a namespace (and not attached): ## [1] readxl_1.3.1 backports_1.2.1 plyr_1.8.6 ## [4] igraph_1.2.6 lazyeval_0.2.2 splines_4.0.4 ## [7] listenv_0.8.0 scattermore_0.7 digest_0.6.27 ## [10] htmltools_0.5.1.1 fansi_0.4.2 magrittr_2.0.1 ## [13] tensor_1.5 cluster_2.1.0 ROCR_1.0-11 ## [16] openxlsx_4.2.3 limma_3.46.0 remotes_2.3.0 ## [19] globals_0.14.0 matrixStats_0.58.0 spatstat.sparse_2.0-0 ## [22] prettyunits_1.1.1 colorspace_2.0-1 rappdirs_0.3.3 ## [25] ggrepel_0.9.1 haven_2.4.1 xfun_0.23 ## [28] callr_3.7.0 crayon_1.4.1 jsonlite_1.7.2 ## [31] spatstat.data_2.1-0 survival_3.2-7 zoo_1.8-9 ## [34] glue_1.4.2 polyclip_1.10-0 gtable_0.3.0 ## [37] leiden_0.3.7 car_3.0-10 pkgbuild_1.2.0 ## [40] future.apply_1.7.0 abind_1.4-5 scales_1.1.1 ## [43] rstatix_0.7.0 miniUI_0.1.1.1 Rcpp_1.0.6 ## [46] viridisLite_0.4.0 xtable_1.8-4 reticulate_1.20 ## [49] spatstat.core_2.1-2 foreign_0.8-81 htmlwidgets_1.5.3 ## [52] httr_1.4.2 RColorBrewer_1.1-2 ellipsis_0.3.2 ## [55] ica_1.0-2 pkgconfig_2.0.3 farver_2.1.0 ## [58] uwot_0.1.10 deldir_0.2-10 utf8_1.2.1 ## [61] tidyselect_1.1.1 labeling_0.4.2 rlang_0.4.11 ## [64] reshape2_1.4.4 later_1.2.0 cellranger_1.1.0 ## [67] munsell_0.5.0 tools_4.0.4 
cli_2.5.0 ## [70] generics_0.1.0 broom_0.7.6 ggridges_0.5.3 ## [73] evaluate_0.14 stringr_1.4.0 fastmap_1.1.0 ## [76] yaml_2.2.1 goftest_1.2-2 processx_3.5.2 ## [79] knitr_1.33 fitdistrplus_1.1-3 zip_2.1.1 ## [82] purrr_0.3.4 RANN_2.6.1 pbapply_1.4-3 ## [85] future_1.21.0 nlme_3.1-152 mime_0.10 ## [88] formatR_1.9 compiler_4.0.4 rstudioapi_0.13 ## [91] plotly_4.9.3 curl_4.3.1 png_0.1-7 ## [94] ggsignif_0.6.1 spatstat.utils_2.1-0 tibble_3.1.2 ## [97] stringi_1.6.2 highr_0.9 ps_1.6.0 ## [100] forcats_0.5.1 lattice_0.20-41 Matrix_1.3-3 ## [103] vctrs_0.3.8 pillar_1.6.1 lifecycle_1.0.0 ## [106] spatstat.geom_2.1-0 lmtest_0.9-38 RcppAnnoy_0.0.18 ## [109] data.table_1.14.0 cowplot_1.1.1 irlba_2.3.3 ## [112] httpuv_1.6.1 patchwork_1.1.1 R6_2.5.0 ## [115] promises_1.2.0.1 KernSmooth_2.23-18 gridExtra_2.3 ## [118] rio_0.5.26 parallelly_1.25.0 codetools_0.2-18 ## [121] MASS_7.3-53 gtools_3.8.2 rprojroot_2.0.2 ## [124] withr_2.4.2 sctransform_0.3.2 hms_1.1.0 ## [127] mgcv_1.8-33 parallel_4.0.4 grid_4.0.4 ## [130] rpart_4.1-15 tidyr_1.1.3 rmarkdown_2.8 ## [133] carData_3.0-4 Rtsne_0.15 ggpubr_0.4.0 ## [136] shiny_1.6.0
================================================ FILE: docs/cogaps.html ================================================

Running CoGAPS on Seurat Objects

Compiled: April 20, 2020

This vignette demonstrates the use of the CoGAPS package on Seurat objects.

Decomposing cell identity for transfer learning across cellular measurements, platforms, tissues, and species

Genevieve L. Stein-O’Brien, Brian S. Clark, Thomas Sherman, Cristina Zibetti, Qiwen Hu, Rachel Sealfon, Sheng Liu, Jiang Qian, Carlo Colantuoni, Seth Blackshaw, Loyal A.Goff, Elana J.Fertig

Cell Systems, 2019.

doi: 10.1016/j.cels.2019.04.004

Bioconductor: https://www.bioconductor.org/packages/release/bioc/html/CoGAPS.html

Prerequisites to install:

library(Seurat)
library(SeuratWrappers)
library(SeuratData)
library(CoGAPS)

Running CoGAPS with Seurat Data Using Cloud Computing

We suggest using a high number of iterations to get robust results when running CoGAPS. This will allow the algorithm to converge. When the system has converged, the results are fairly robust. 50,000 iterations were used in this example and the runtime was roughly five hours for each run (three patterns and ten patterns). We used Amazon Web Services, a Cloud Computing Service, to run CoGAPS. An example to run locally is featured later on.

Using CoGAPS to Identify Cell Lineage

AWS was used to run the below section of CoGAPS to look for three patterns

To learn more about this dataset, type ?pbmc3k

InstallData("pbmc3k")
data("pbmc3k.final")
params <- CogapsParams(singleCell = TRUE, sparseOptimization = TRUE, seed = 123, nIterations = 50000, 
    nPatterns = 3, distributed = "genome-wide")
params <- setDistributedParams(params, nSets = 5)
pbmc3k.final <- RunCoGAPS(pbmc3k.final, temp.file = TRUE, params = params)

The two major lineages of blood cells are categorized as either myeloid or lymphoid. This specialization requires transcriptional diversification during lineage commitment. There are specific genes related to each of these lineages. In our data, CoGAPS identifies distinct patterns that segregate cells by immune lineage as shown below.

Lymphoid Lineage

VlnPlot(pbmc3k.final, features = "CoGAPS_3")

Myeloid Lineage

VlnPlot(pbmc3k.final, features = "CoGAPS_1")

DimPlot(pbmc3k.final, reduction = "CoGAPS", pt.size = 0.5, dims = c(1, 3))

Using CoGAPS to Identify Cell Type

AWS was used to run the below section of CoGAPS to look for ten patterns

To learn more about this dataset, type ?pbmc3k

InstallData("pbmc3k")
data("pbmc3k.final")

params <- CogapsParams(singleCell = TRUE, sparseOptimization = TRUE, seed = 123, nIterations = 50000, 
    nPatterns = 10, distributed = "genome-wide")
params <- setDistributedParams(params, nSets = 5)
pbmc3k.final <- RunCoGAPS(object = pbmc3k.final, temp.file = TRUE, params = params)

Both the myeloid and lymphoid lineages give rise to many different cell types critical to the immune system. CoGAPS is able to discern cell type specific patterns, such as those shown below for DC (CoGAPS_3) and B (CoGAPS_4) cells. Importantly, CoGAPS is also able to identify phenotypic subtypes within a population of cells, such as FCGR3A+ Monocytes (CoGAPS_6).

DC Cells

VlnPlot(pbmc3k.final, features = "CoGAPS_3")

B Cells

VlnPlot(pbmc3k.final, features = "CoGAPS_4")

FCGR3A+ Monocytes

VlnPlot(pbmc3k.final, features = "CoGAPS_6")

DimPlot(pbmc3k.final, reduction = "CoGAPS", pt.size = 0.5, dims = c(3, 4))

DimPlot(pbmc3k.final, reduction = "CoGAPS", pt.size = 0.5, dims = c(3, 6))

DimPlot(pbmc3k.final, reduction = "CoGAPS", pt.size = 0.5, dims = c(4, 6))

Running CoGAPS with Seurat Data Locally

For example purposes, we will run locally using 5,000 iterations. Note: Results may be different because of compiler dependence. The Boost random number generator was used for this example.

To learn more about this dataset, type ?pbmc3k

InstallData("pbmc3k")
data("pbmc3k.final")
pbmc3k.final <- RunCoGAPS(object = pbmc3k.final, nPatterns = 3, nIterations = 5000, outputFrequency = 1000, 
    sparseOptimization = TRUE, nThreads = 1, distributed = "genome-wide", singleCell = TRUE, seed = 891)
DimPlot(pbmc3k.final, reduction = "CoGAPS", pt.size = 0.5, dims = c(3, 2))

Lymphoid Lineage

VlnPlot(pbmc3k.final, features = "CoGAPS_2")

Myeloid Lineage

VlnPlot(pbmc3k.final, features = "CoGAPS_3")

Additional Features of CoGAPS

Uncertainty Matrix

In addition to providing the data, the user can also specify an uncertainty measurement - the standard deviation of each entry in the data matrix. By default, CoGAPS assumes that the standard deviation matrix is 10% of the data matrix. This is a reasonable heuristic to use, but for specific types of data you may be able to provide better information. An uncertainty matrix can be specified using the uncertainty argument when running CoGAPS.

pbmc3k.final <- RunCoGAPS(pbmc3k.final, uncertainty = datMat.uncertainty, nPatterns = 10, nIterations = 100, 
    outputFrequency = 100, sparseOptimization = TRUE, nThreads = 1, singleCell = TRUE, distributed = "genome-wide")

Running CoGAPS in Parallel for Large Datasets

Non-Negative Matrix Factorization algorithms typically require long computation times and CoGAPS is no exception. The simplest way to run CoGAPS in parallel is to provide the nThreads argument when running CoGAPS. This allows the underlying algorithm to run on multiple threads and has no effect on the mathematics of the algorithm. For more information on running CoGAPS in parallel, visit CoGAPS Vignette.

pbmc3k.final <- RunCoGAPS(pbmc3k.final, nPatterns = 10, nIterations = 100, outputFrequency = 100, 
    sparseOptimization = TRUE, nThreads = 3, singleCell = TRUE, distributed = "genome-wide")

Resources for CoGAPS

Visit the following resources to learn more about CoGAPS and running CoGAPS outside of the Seurat environment:

================================================ FILE: docs/cogaps.md ================================================ Running CoGAPS on Seurat Objects ================ Compiled: April 20, 2020 - [Running CoGAPS with Seurat Data Using Cloud Computing](#running-cogaps-with-seurat-data-using-cloud-computing) - [Using CoGAPS to Identify Cell Lineage](#using-cogaps-to-identify-cell-lineage) - [Using CoGAPS to Identify Cell Type](#using-cogaps-to-identify-cell-type) - [Running CoGAPS with Seurat Data Locally](#running-cogaps-with-seurat-data-locally) - [Lymphoid Lineage](#lymphoid-lineage-1) - [Myeloid Lineage](#myeloid-lineage-1) - [Additional Features of CoGAPS](#additional-features-of-cogaps) - [Resoures for CoGAPS](#resoures-for-cogaps) This vignette demonstrates the use of the CoGAPS package on Seurat objects. > *Decomposing cell identity for transfer learning across cellular measurements, platforms, tissues, and species* > > Genevieve L. Stein-O’Brien, Brian S. Clark, Thomas Sherman, Cristina Zibetti, Qiwen Hu, Rachel Sealfon, Sheng Liu, Jiang Qian, Carlo Colantuoni, Seth Blackshaw, Loyal A.Goff, Elana J.Fertig > > Cell Systems, 2019. > > doi: [10.1016/j.cels.2019.04.004](https://doi.org/10.1016/j.cels.2019.04.004) > > Bioconductor: Prerequisites to install: - [Seurat](https://satijalab.org/seurat/install) - [SeuratWrappers](https://github.com/satijalab/seurat-wrappers) - [SeuratData](https://github.com/satijalab/seurat-data) - [CoGAPS](https://bioconductor.org/packages/release/bioc/vignettes/CoGAPS/inst/doc/CoGAPS.html#vignette-version) ``` r library(Seurat) library(SeuratWrappers) library(SeuratData) library(CoGAPS) ``` Running CoGAPS with Seurat Data Using Cloud Computing ===================================================== We suggest using a high number of iterations to get robust results when running CoGAPS. This will allow the algorithm to converge. When the system has converged, the results are fairly robust. 
50,000 iterations were used in this example and the runtime was roughly five hours for each run (three patterns and ten patterns). We used Amazon Web Services, a Cloud Computing Service, to run CoGAPS. An example to run locally is featured later on. ### Using CoGAPS to Identify Cell Lineage *AWS was used to run the below section of CoGAPS to look for three patterns* To learn more about this dataset, type `?pbmc3k` ``` r InstallData("pbmc3k") data("pbmc3k.final") params <- CogapsParams(singleCell = TRUE, sparseOptimization = TRUE, seed = 123, nIterations = 50000, nPatterns = 3, distributed = "genome-wide") params <- setDistributedParams(params, nSets = 5) pbmc3k.final <- RunCoGAPS(pbmc3k.final, temp.file = TRUE, params = params) ``` The two major lineages of blood cells are categorized as either myeloid or lymphoid. This specialization requires transcriptional diversification during lineage commitment. There are specific genes related to each of these lineages. In our data, CoGAPS identifies distinct patterns that segregate cells by immune lineage as shown below. 
#### Lymphoid Lineage ``` r VlnPlot(pbmc3k.final, features = "CoGAPS_3") ``` ![](cogaps_files/figure-markdown_github/threePatternsLymphoidVlnPlot-1.png) #### Myeloid Lineage ``` r VlnPlot(pbmc3k.final, features = "CoGAPS_1") ``` ![](cogaps_files/figure-markdown_github/threePatternsMyeloidVlnPlot-1.png) ``` r DimPlot(pbmc3k.final, reduction = "CoGAPS", pt.size = 0.5, dims = c(1, 3)) ``` ![](cogaps_files/figure-markdown_github/threePatternsScatterPlot-1.png) ### Using CoGAPS to Identify Cell Type *AWS was used to run the below section of CoGAPS to look for ten patterns* To learn more about this dataset, type `?pbmc3k` ``` r InstallData("pbmc3k") data("pbmc3k.final") params <- CogapsParams(singleCell = TRUE, sparseOptimization = TRUE, seed = 123, nIterations = 50000, nPatterns = 10, distributed = "genome-wide") params <- setDistributedParams(params, nSets = 5) pbmc3k.final <- RunCoGAPS(object = pbmc3k.final, temp.file = TRUE, params = params) ``` Both the myeloid or lymphoid lineages give rise to many different cell types critical to the immune system. CoGAPS is able to discern cell type specific patterns, such as those shown below for DC (CoGAPS\_3) and B (CoGAPS\_4) cells. Importantly, CoGAPS is also able to identify phenotypic subtypes within a population of cells, such as FCGR3A+ Monocytes (CoGAPS\_6). 
#### DC Cells ``` r VlnPlot(pbmc3k.final, features = "CoGAPS_3") ``` ![](cogaps_files/figure-markdown_github/tenPatternsDcVlnPlot-1.png) #### B Cells ``` r VlnPlot(pbmc3k.final, features = "CoGAPS_4") ``` ![](cogaps_files/figure-markdown_github/tenPatternsBVlnPlot-1.png) #### FCGR3A+ Monocytes ``` r VlnPlot(pbmc3k.final, features = "CoGAPS_6") ``` ![](cogaps_files/figure-markdown_github/tenPatternsFcgr3aVlnPlot-1.png) ``` r DimPlot(pbmc3k.final, reduction = "CoGAPS", pt.size = 0.5, dims = c(3, 4)) ``` ![](cogaps_files/figure-markdown_github/tenPatternsScatterPlot-1.png) ``` r DimPlot(pbmc3k.final, reduction = "CoGAPS", pt.size = 0.5, dims = c(3, 6)) ``` ![](cogaps_files/figure-markdown_github/tenPatternsScatterPlot-2.png) ``` r DimPlot(pbmc3k.final, reduction = "CoGAPS", pt.size = 0.5, dims = c(4, 6)) ``` ![](cogaps_files/figure-markdown_github/tenPatternsScatterPlot-3.png) Running CoGAPS with Seurat Data Locally ======================================= For example purposes, we will run locally using 5,000 iterations. *Note:* Results may be different because of compiler dependence. Boost random number processor was used for this example. 
To learn more about this dataset, type `?pbmc3k` ``` r InstallData("pbmc3k") data("pbmc3k.final") pbmc3k.final <- RunCoGAPS(object = pbmc3k.final, nPatterns = 3, nIterations = 5000, outputFrequency = 1000, sparseOptimization = TRUE, nThreads = 1, distributed = "genome-wide", singleCell = TRUE, seed = 891) ``` ``` r DimPlot(pbmc3k.final, reduction = "CoGAPS", pt.size = 0.5, dims = c(3, 2)) ``` ![](cogaps_files/figure-markdown_github/threePatternsExampleScatterPlot-1.png) #### Lymphoid Lineage ``` r VlnPlot(pbmc3k.final, features = "CoGAPS_2") ``` ![](cogaps_files/figure-markdown_github/threePatternsExampleLympVlnPlot-1.png) #### Myeloid Lineage ``` r VlnPlot(pbmc3k.final, features = "CoGAPS_3") ``` ![](cogaps_files/figure-markdown_github/threePatternsExampleMyeVlnPlot-1.png) ### Additional Features of CoGAPS #### Uncertainty Matrix In addition to providing the data, the user can also specify an uncertainty measurement - the standard deviation of each entry in the data matrix. By default, CoGAPS assumes that the standard deviation matrix is 10% of the data matrix. This is a reasonable heuristic to use, but for specific types of data you may be able to provide better information. An uncertainty matrix can be specified using the `uncertainty` argument when running CoGAPS. ``` r pbmc3k.final <- RunCoGAPS(pbmc3k.final, uncertainty = datMat.uncertainty, nPatterns = 10, nIterations = 100, outputFrequency = 100, sparseOptimization = TRUE, nThreads = 1, singleCell = TRUE, distributed = "genome-wide") ``` #### Running CoGAPS in Parallel for Large Datasets Non-Negative Matrix Factorization algorithms typically require long computation times and CoGAPS is no exception. The simplest way to run CoGAPS in parallel is to provide the `nThreads` argument when running CoGAPS. This allows the underlying algorithm to run on multiple threads and has no effect on the mathematics of the algorithm. 
For more information on running CoGAPS in parallel, visit [CoGAPS Vignette](https://bioconductor.org/packages/release/bioc/vignettes/CoGAPS/inst/doc/CoGAPS.html#vignette-version). ``` r pbmc3k.final <- RunCoGAPS(pbmc3k.final, nPatterns = 10, nIterations = 100, outputFrequency = 100, sparseOptimization = TRUE, nThreads = 3, singleCell = TRUE, distributed = "genome-wide") ``` ### Resoures for CoGAPS Visit the following resources to learn more about CoGAPS and running CoGAPS outside of the Seurat environment: - [CoGAPS Vignette](https://bioconductor.org/packages/release/bioc/vignettes/CoGAPS/inst/doc/CoGAPS.html#introduction) - [Wiki Link](https://github.com/FertigLab/CoGAPS/wiki) ================================================ FILE: docs/cogaps.rmd ================================================ --- title: "Running CoGAPS on Seurat Objects" date: 'Compiled: `r format(Sys.Date(), "%B %d, %Y")`' output: html_document: df_print: kable theme: united highlight: null github_document: html_preview: true toc: true toc_depth: 3 --- This vignette demonstrates the use of the CoGAPS package on Seurat objects. > *Decomposing cell identity for transfer learning across cellular measurements, platforms, tissues, and species* > > Genevieve L. Stein-O’Brien, Brian S. Clark, Thomas Sherman, Cristina Zibetti, Qiwen Hu, Rachel Sealfon, Sheng Liu, Jiang Qian, Carlo Colantuoni, Seth Blackshaw, Loyal A.Goff, Elana J.Fertig > > Cell Systems, 2019. 
> > doi: [10.1016/j.cels.2019.04.004](https://doi.org/10.1016/j.cels.2019.04.004) > > Bioconductor: https://www.bioconductor.org/packages/release/bioc/html/CoGAPS.html ```{r setup, include=FALSE} knitr::opts_chunk$set( tidy = TRUE, tidy.opts = list(width.cutoff = 95), message = FALSE, warning = FALSE ) ``` Prerequisites to install: * [Seurat](https://satijalab.org/seurat/install) * [SeuratWrappers](https://github.com/satijalab/seurat-wrappers) * [SeuratData](https://github.com/satijalab/seurat-data) * [CoGAPS](https://bioconductor.org/packages/release/bioc/vignettes/CoGAPS/inst/doc/CoGAPS.html#vignette-version) ```{r install.deps, include = FALSE} BiocManager::install("CoGAPS") ``` ```{r packages} library(Seurat) library(SeuratWrappers) library(SeuratData) library(CoGAPS) ``` # Running CoGAPS with Seurat Data Using Cloud Computing We suggest using a high number of iterations to get robust results when running CoGAPS. This will allow the algorithm to converge. When the system has converged, the results are fairly robust. 50,000 iterations were used in this example and the runtime was roughly five hours for each run (three patterns and ten patterns). We used Amazon Web Services, a Cloud Computing Service, to run CoGAPS. An example to run locally is featured later on. 
### Using CoGAPS to Identify Cell Lineage *AWS was used to run the below section of CoGAPS to look for three patterns* To learn more about this dataset, type ```?pbmc3k``` ```{r threePatternsAWSCode, eval = FALSE} InstallData("pbmc3k") data("pbmc3k.final") params <- CogapsParams( singleCell = TRUE, sparseOptimization = TRUE, seed = 123, nIterations = 50000, nPatterns = 3, distributed = 'genome-wide' ) params <- setDistributedParams(params, nSets = 5) pbmc3k.final <- RunCoGAPS(pbmc3k.final, temp.file = TRUE, params = params) ``` ```{r loadThreePatterns, echo=FALSE} con <- url('https://seurat.nygenome.org/cogaps/CoGAPS_threepatterns.rds') CoGAPS_results3 <- readRDS(con) close(con) InstallData("pbmc3k") data("pbmc3k.final") pbmc3k.final[["CoGAPS"]] <- CreateDimReducObject( embeddings =CoGAPS_results3@sampleFactors, loadings = CoGAPS_results3@featureLoadings, key = "CoGAPS_", assay = DefaultAssay(pbmc3k.final) ) ``` The two major lineages of blood cells are categorized as either myeloid or lymphoid. This specialization requires transcriptional diversification during lineage commitment. There are specific genes related to each of these lineages. In our data, CoGAPS identifies distinct patterns that segregate cells by immune lineage as shown below. 
#### Lymphoid Lineage ```{r threePatternsLymphoidVlnPlot} VlnPlot(pbmc3k.final, features = "CoGAPS_3") ``` #### Myeloid Lineage ```{r threePatternsMyeloidVlnPlot} VlnPlot(pbmc3k.final, features = "CoGAPS_1") ``` ```{r threePatternsScatterPlot} DimPlot(pbmc3k.final, reduction = "CoGAPS", pt.size = 0.5, dims = c(1, 3)) ``` ### Using CoGAPS to Identify Cell Type *AWS was used to run the below section of CoGAPS to look for ten patterns* To learn more about this dataset, type ```?pbmc3k``` ```{r tenPatternsAWSCode, eval = FALSE} InstallData("pbmc3k") data("pbmc3k.final") params <- CogapsParams( singleCell = TRUE, sparseOptimization = TRUE, seed = 123, nIterations = 50000, nPatterns = 10, distributed='genome-wide' ) params <- setDistributedParams(params, nSets = 5) pbmc3k.final <- RunCoGAPS(object = pbmc3k.final, temp.file = TRUE, params = params) ``` ```{r loadTenPatternsResults, echo=FALSE} con <- url('https://seurat.nygenome.org/cogaps/CoGAPS_tenpatterns.rds') CoGAPS_results10<-readRDS(con) close(con) data("pbmc3k.final") pbmc3k.final[["CoGAPS"]] <- CreateDimReducObject( embeddings =CoGAPS_results10@sampleFactors, loadings = CoGAPS_results10@featureLoadings, key = "CoGAPS_", assay = DefaultAssay(pbmc3k.final) ) ``` Both the myeloid or lymphoid lineages give rise to many different cell types critical to the immune system. CoGAPS is able to discern cell type specific patterns, such as those shown below for DC (CoGAPS_3) and B (CoGAPS_4) cells. Importantly, CoGAPS is also able to identify phenotypic subtypes within a population of cells, such as FCGR3A+ Monocytes (CoGAPS_6). 
#### DC Cells ```{r tenPatternsDcVlnPlot} VlnPlot(pbmc3k.final, features = "CoGAPS_3") ``` #### B Cells ```{r tenPatternsBVlnPlot} VlnPlot(pbmc3k.final, features="CoGAPS_4") ``` #### FCGR3A+ Monocytes ```{r tenPatternsFcgr3aVlnPlot} VlnPlot(pbmc3k.final, features="CoGAPS_6") ``` ```{r tenPatternsScatterPlot} DimPlot(pbmc3k.final, reduction = "CoGAPS", pt.size = 0.5, dims = c(3,4)) DimPlot(pbmc3k.final, reduction = "CoGAPS", pt.size = 0.5, dims = c(3,6)) DimPlot(pbmc3k.final, reduction = "CoGAPS", pt.size = 0.5, dims = c(4,6)) ``` # Running CoGAPS with Seurat Data Locally For example purposes, we will run locally using 5,000 iterations. *Note:* Results may be different because of compiler dependence. Boost random number processor was used for this example. To learn more about this dataset, type ```?pbmc3k``` ```{r threePatternsExample, results='hide'} InstallData("pbmc3k") data("pbmc3k.final") pbmc3k.final <- RunCoGAPS( object = pbmc3k.final, nPatterns = 3, nIterations = 5000, outputFrequency = 1000, sparseOptimization = TRUE, nThreads = 1, distributed = "genome-wide", singleCell = TRUE, seed = 891 ) ``` ```{r threePatternsExampleScatterPlot} DimPlot(pbmc3k.final, reduction = "CoGAPS", pt.size = 0.5, dims = c(3,2)) ``` #### Lymphoid Lineage ```{r threePatternsExampleLympVlnPlot} VlnPlot(pbmc3k.final,features="CoGAPS_2") ``` #### Myeloid Lineage ```{r threePatternsExampleMyeVlnPlot} VlnPlot(pbmc3k.final, features="CoGAPS_3") ``` ### Additional Features of CoGAPS #### Uncertainty Matrix In addition to providing the data, the user can also specify an uncertainty measurement - the standard deviation of each entry in the data matrix. By default, CoGAPS assumes that the standard deviation matrix is 10% of the data matrix. This is a reasonable heuristic to use, but for specific types of data you may be able to provide better information. An uncertainty matrix can be specified using the ```uncertainty``` argument when running CoGAPS. 
```{r uncertaintyMatrixExample, eval = FALSE} pbmc3k.final <- RunCoGAPS( pbmc3k.final, uncertainty = datMat.uncertainty, nPatterns = 10, nIterations = 100, outputFrequency = 100, sparseOptimization = TRUE, nThreads = 1, singleCell = TRUE, distributed = "genome-wide" ) ``` #### Running CoGAPS in Parallel for Large Datasets Non-Negative Matrix Factorization algorithms typically require long computation times and CoGAPS is no exception. The simplest way to run CoGAPS in parallel is to provide the ```nThreads``` argument when running CoGAPS. This allows the underlying algorithm to run on multiple threads and has no effect on the mathematics of the algorithm. For more information on running CoGAPS in parallel, visit [CoGAPS Vignette](https://bioconductor.org/packages/release/bioc/vignettes/CoGAPS/inst/doc/CoGAPS.html#vignette-version). ```{r parallelExample, eval = FALSE} pbmc3k.final <- RunCoGAPS( pbmc3k.final, nPatterns = 10, nIterations = 100, outputFrequency = 100, sparseOptimization = TRUE, nThreads = 3, singleCell = TRUE, distributed = "genome-wide" ) ``` ### Resources for CoGAPS Visit the following resources to learn more about CoGAPS and running CoGAPS outside of the Seurat environment: * [CoGAPS Vignette](https://bioconductor.org/packages/release/bioc/vignettes/CoGAPS/inst/doc/CoGAPS.html#introduction) * [Wiki Link](https://github.com/FertigLab/CoGAPS/wiki) ================================================ FILE: docs/conos.Rmd ================================================ --- title: "Integration of datasets using Conos" date: 'Compiled: `r format(Sys.Date(), "%B %d, %Y")`' output: github_document: html_preview: true toc: true toc_depth: 3 fig_width: 16 html_document: df_print: kable theme: united fig_height: 5 fig_width: 16 out_height: 4 --- This vignette demonstrates the use of the Conos package in Seurat. Commands and parameters are based off of the [Conos tutorial](https://github.com/hms-dbmi/conos/blob/master/vignettes/walkthrough.md). 
If you use Conos in your work, please cite: > *Joint analysis of heterogeneous single-cell RNA-seq dataset collections* > > Nikolas Barkas, Viktor Petukhov, Daria Nikolaeva, Yaroslav Lozinsky, Samuel Demharter, Konstantin Khodosevich, Peter V. Kharchenko > > Nature Methods, 2019. > > doi: [10.1038/s41592-019-0466-z](https://doi.org/10.1038/s41592-019-0466-z) > > GitHub: https://github.com/hms-dbmi/conos ```{r setup, include=FALSE} knitr::opts_chunk$set( tidy = TRUE, tidy.opts = list(width.cutoff = 95), message = FALSE, warning = FALSE ) ``` Prerequisites to install: * [Seurat](https://satijalab.org/seurat/install) * [Conos](https://github.com/hms-dbmi/conos) * [SeuratWrappers](https://github.com/satijalab/seurat-wrappers) * [SeuratData](https://github.com/satijalab/seurat-data) ```{r packages} library(conos) library(Seurat) library(SeuratData) library(SeuratWrappers) ``` ## {.tabset .tabset-pills} ### Systematic comparative analysis of human PBMC To learn more about this dataset, type `?pbmcsca` ```{r pbmcsca, results='hide', cache=TRUE} InstallData("pbmcsca") data("pbmcsca") pbmcsca.panel <- SplitObject(pbmcsca, split.by = 'Method') for (i in 1:length(pbmcsca.panel)) { pbmcsca.panel[[i]] <- NormalizeData(pbmcsca.panel[[i]]) %>% FindVariableFeatures() %>% ScaleData() %>% RunPCA(verbose = FALSE) } pbmcsca.con <- Conos$new(pbmcsca.panel) pbmcsca.con$buildGraph( k = 15, k.self = 5, space = 'PCA', ncomps = 30, n.odgenes = 2000, matching.method = 'mNN', metric = 'angular', score.component.variance = TRUE, verbose = TRUE ) pbmcsca.con$findCommunities() pbmcsca.con$embedGraph() pbmcsca <- as.Seurat(pbmcsca.con) DimPlot(pbmcsca, reduction = "largeVis", group.by = c("Method", "ident", 'CellType'), ncol = 3) ``` ### Interferon-stimulated and control PBMC To learn more about this dataset, type `?ifnb` ```{r ifnb, results='hide', cache=TRUE} InstallData("ifnb") data("ifnb") ifnb.panel <- SplitObject(ifnb, split.by = 'stim') for (i in 1:length(ifnb.panel)) { ifnb.panel[[i]] <- 
NormalizeData(ifnb.panel[[i]]) %>% FindVariableFeatures() %>% ScaleData() %>% RunPCA(verbose = FALSE) } ifnb.con <- Conos$new(ifnb.panel) ifnb.con$buildGraph( k = 15, k.self = 5, space = 'PCA', ncomps = 30, n.odgenes = 2000, matching.method = 'mNN', metric = 'angular', score.component.variance = TRUE, verbose = TRUE ) ifnb.con$findCommunities() ifnb.con$embedGraph() ifnb <- as.Seurat(ifnb.con) DimPlot(ifnb, reduction = "largeVis", group.by = c("stim", "ident", 'seurat_annotations'), ncol = 3) ``` ### Eight human pancreatic islet datasets To learn more about this dataset, type `?panc8` ```{r pancreas, results='hide', cache=TRUE} InstallData("panc8") data("panc8") panc8.panel <- SplitObject(panc8, split.by = 'replicate') for (i in 1:length(panc8.panel)) { panc8.panel[[i]] <- NormalizeData(panc8.panel[[i]]) %>% FindVariableFeatures() %>% ScaleData() %>% RunPCA(verbose = FALSE) } panc8.con <- Conos$new(panc8.panel) panc8.con$buildGraph( k = 15, k.self = 5, space = 'PCA', ncomps = 30, n.odgenes = 2000, matching.method = 'mNN', metric = 'angular', score.component.variance = TRUE, verbose = TRUE ) panc8.con$findCommunities() panc8.con$embedGraph() panc8 <- as.Seurat(panc8.con) DimPlot(panc8, reduction = "largeVis", group.by = c("replicate", "ident", 'celltype'), ncol = 3) ``` ================================================ FILE: docs/conos.html ================================================ Integration of datasets using Conos

This vignette demonstrates the use of the Conos package in Seurat. Commands and parameters are based off of the Conos tutorial. If you use Conos in your work, please cite:

Joint analysis of heterogeneous single-cell RNA-seq dataset collections

Nikolas Barkas, Viktor Petukhov, Daria Nikolaeva, Yaroslav Lozinsky, Samuel Demharter, Konstantin Khodosevich, Peter V. Kharchenko

Nature Methods, 2019.

doi: 10.1038/s41592-019-0466-z

GitHub: https://github.com/hms-dbmi/conos

Prerequisites to install:

library(conos)
library(Seurat)
library(SeuratData)
library(SeuratWrappers)

Systematic comparative analysis of human PBMC

To learn more about this dataset, type ?pbmcsca

InstallData("pbmcsca")
data("pbmcsca")
pbmcsca.panel <- SplitObject(pbmcsca, split.by = "Method")
for (i in 1:length(pbmcsca.panel)) {
    pbmcsca.panel[[i]] <- NormalizeData(pbmcsca.panel[[i]]) %>% FindVariableFeatures() %>% ScaleData() %>% 
        RunPCA(verbose = FALSE)
}
pbmcsca.con <- Conos$new(pbmcsca.panel)
pbmcsca.con$buildGraph(k = 15, k.self = 5, space = "PCA", ncomps = 30, n.odgenes = 2000, matching.method = "mNN", 
    metric = "angular", score.component.variance = TRUE, verbose = TRUE)
pbmcsca.con$findCommunities()
pbmcsca.con$embedGraph()
pbmcsca <- as.Seurat(pbmcsca.con)
DimPlot(pbmcsca, reduction = "largeVis", group.by = c("Method", "ident", "CellType"), ncol = 3)

Interferon-stimulated and control PBMC

To learn more about this dataset, type ?ifnb

InstallData("ifnb")
data("ifnb")
ifnb.panel <- SplitObject(ifnb, split.by = "stim")
for (i in 1:length(ifnb.panel)) {
    ifnb.panel[[i]] <- NormalizeData(ifnb.panel[[i]]) %>% FindVariableFeatures() %>% ScaleData() %>% 
        RunPCA(verbose = FALSE)
}
ifnb.con <- Conos$new(ifnb.panel)
ifnb.con$buildGraph(k = 15, k.self = 5, space = "PCA", ncomps = 30, n.odgenes = 2000, matching.method = "mNN", 
    metric = "angular", score.component.variance = TRUE, verbose = TRUE)
ifnb.con$findCommunities()
ifnb.con$embedGraph()
ifnb <- as.Seurat(ifnb.con)
DimPlot(ifnb, reduction = "largeVis", group.by = c("stim", "ident", "seurat_annotations"), ncol = 3)

Eight human pancreatic islet datasets

To learn more about this dataset, type ?panc8

InstallData("panc8")
data("panc8")
panc8.panel <- SplitObject(panc8, split.by = "replicate")
for (i in 1:length(panc8.panel)) {
    panc8.panel[[i]] <- NormalizeData(panc8.panel[[i]]) %>% FindVariableFeatures() %>% ScaleData() %>% 
        RunPCA(verbose = FALSE)
}
panc8.con <- Conos$new(panc8.panel)
panc8.con$buildGraph(k = 15, k.self = 5, space = "PCA", ncomps = 30, n.odgenes = 2000, matching.method = "mNN", 
    metric = "angular", score.component.variance = TRUE, verbose = TRUE)
panc8.con$findCommunities()
panc8.con$embedGraph()
panc8 <- as.Seurat(panc8.con)
DimPlot(panc8, reduction = "largeVis", group.by = c("replicate", "ident", "celltype"), ncol = 3)

================================================ FILE: docs/conos.md ================================================ Integration of datasets using Conos ================ Compiled: July 15, 2019 - [](#section) - [Systematic comparative analysis of human PBMC](#systematic-comparative-analysis-of-human-pbmc) - [Interferon-stimulated and control PBMC](#interferon-stimulated-and-control-pbmc) - [Eight human pancreatic islet datasets](#eight-human-pancreatic-islet-datasets) This vignette demonstrates the use of the Conos package in Seurat. Commands and parameters are based off of the [Conos tutorial](https://github.com/hms-dbmi/conos/blob/master/vignettes/walkthrough.md). If you use Conos in your work, please cite: > *Joint analysis of heterogeneous single-cell RNA-seq dataset collections* > > Nikolas Barkas, Viktor Petukhov, Daria Nikolaeva, Yaroslav Lozinsky, Samuel Demharter, Konstantin Khodosevich, Peter V. Kharchenko > > Nature Methods, 2019. > > doi: [10.1038/s41592-019-0466-z](https://doi.org/10.1038/s41592-019-0466-z) > > GitHub: Prerequisites to install: - [Seurat](https://satijalab.org/seurat/install) - [Conos](https://github.com/hms-dbmi/conos) - [SeuratWrappers](https://github.com/satijalab/seurat-wrappers) - [SeuratData](https://github.com/satijalab/seurat-data) ``` r library(conos) library(Seurat) library(SeuratData) library(SeuratWrappers) ``` ### Systematic comparative analysis of human PBMC To learn more about this dataset, type `?pbmcsca` ``` r InstallData("pbmcsca") data("pbmcsca") pbmcsca.panel <- SplitObject(pbmcsca, split.by = "Method") for (i in 1:length(pbmcsca.panel)) { pbmcsca.panel[[i]] <- NormalizeData(pbmcsca.panel[[i]]) %>% FindVariableFeatures() %>% ScaleData() %>% RunPCA(verbose = FALSE) } pbmcsca.con <- Conos$new(pbmcsca.panel) pbmcsca.con$buildGraph(k = 15, k.self = 5, space = "PCA", ncomps = 30, n.odgenes = 2000, matching.method = "mNN", metric = "angular", score.component.variance = TRUE, verbose = TRUE) pbmcsca.con$findCommunities() 
pbmcsca.con$embedGraph() pbmcsca <- as.Seurat(pbmcsca.con) DimPlot(pbmcsca, reduction = "largeVis", group.by = c("Method", "ident", "CellType"), ncol = 3) ``` ![](conos_files/figure-markdown_github/pbmcsca-1.png) ### Interferon-stimulated and control PBMC To learn more about this dataset, type `?ifnb` ``` r InstallData("ifnb") data("ifnb") ifnb.panel <- SplitObject(ifnb, split.by = "stim") for (i in 1:length(ifnb.panel)) { ifnb.panel[[i]] <- NormalizeData(ifnb.panel[[i]]) %>% FindVariableFeatures() %>% ScaleData() %>% RunPCA(verbose = FALSE) } ifnb.con <- Conos$new(ifnb.panel) ifnb.con$buildGraph(k = 15, k.self = 5, space = "PCA", ncomps = 30, n.odgenes = 2000, matching.method = "mNN", metric = "angular", score.component.variance = TRUE, verbose = TRUE) ifnb.con$findCommunities() ifnb.con$embedGraph() ifnb <- as.Seurat(ifnb.con) DimPlot(ifnb, reduction = "largeVis", group.by = c("stim", "ident", "seurat_annotations"), ncol = 3) ``` ![](conos_files/figure-markdown_github/ifnb-1.png) ### Eight human pancreatic islet datasets To learn more about this dataset, type `?panc8` ``` r InstallData("panc8") data("panc8") panc8.panel <- SplitObject(panc8, split.by = "replicate") for (i in 1:length(panc8.panel)) { panc8.panel[[i]] <- NormalizeData(panc8.panel[[i]]) %>% FindVariableFeatures() %>% ScaleData() %>% RunPCA(verbose = FALSE) } panc8.con <- Conos$new(panc8.panel) panc8.con$buildGraph(k = 15, k.self = 5, space = "PCA", ncomps = 30, n.odgenes = 2000, matching.method = "mNN", metric = "angular", score.component.variance = TRUE, verbose = TRUE) panc8.con$findCommunities() panc8.con$embedGraph() panc8 <- as.Seurat(panc8.con) DimPlot(panc8, reduction = "largeVis", group.by = c("replicate", "ident", "celltype"), ncol = 3) ``` ![](conos_files/figure-markdown_github/pancreas-1.png) ================================================ FILE: docs/fast_mnn.Rmd ================================================ --- title: "Running fastMNN on Seurat Objects" date: 'Compiled: `r 
format(Sys.Date(), "%B %d, %Y")`' output: github_document: html_preview: true toc: true toc_depth: 3 fig_width: 16 html_document: df_print: kable theme: united fig_height: 5 fig_width: 16 out_height: 4 --- This vignette demonstrates how to run fastMNN on Seurat objects. Parameters and commands are based off of the [fastMNN help page](https://rdrr.io/github/LTLA/batchelor/man/fastMNN.html). If you use fastMNN, please cite: > *Batch effects in single-cell RNA-sequencing data are corrected by matching mutual nearest neighbors* > > Laleh Haghverdi, Aaron T L Lun, Michael D Morgan & John C Marioni > > Nature Biotechnology, 2018 > > doi: [10.1038/nbt.4091](https://doi.org/10.1038/nbt.4091) > > Bioconductor: https://bioconductor.org/packages/release/bioc/html/batchelor.html ```{r setup, include=FALSE} knitr::opts_chunk$set( tidy = TRUE, tidy.opts = list(width.cutoff = 95), message = FALSE, warning = FALSE ) options(timeout = 1000) ``` Prerequisites to install: * [Seurat](https://satijalab.org/seurat/install) * [SeuratWrappers](https://github.com/satijalab/seurat-wrappers) * [SeuratData](https://github.com/satijalab/seurat-data) ```{r packages} library(Seurat) library(SeuratData) library(SeuratWrappers) ``` ## {.tabset .tabset-pills} ### Systematic comparative analysis of human PBMC To learn more about this dataset, type `?pbmcsca` ```{r pbmcsca, results='hide'} InstallData("pbmcsca") data("pbmcsca") pbmcsca <- NormalizeData(pbmcsca) pbmcsca <- FindVariableFeatures(pbmcsca) pbmcsca <- RunFastMNN(object.list = SplitObject(pbmcsca, split.by = 'Method')) pbmcsca <- RunUMAP(pbmcsca, reduction = 'mnn', dims = 1:30) pbmcsca <- FindNeighbors(pbmcsca, reduction = 'mnn', dims = 1:30) pbmcsca <- FindClusters(pbmcsca) DimPlot(pbmcsca, group.by = c('Method', 'ident',"CellType"), ncol = 3) ``` ```{r cleanup_pbmcsca, include=FALSE} rm(pbmcsca) gc(verbose = FALSE) ``` ### Interferon-stimulated and control PBMC To learn more about this dataset, type `?ifnb` ```{r ifnb_stim, 
results='hide'} InstallData("ifnb") data("ifnb") ifnb <- NormalizeData(ifnb) ifnb <- FindVariableFeatures(ifnb) ifnb <- RunFastMNN(object.list = SplitObject(ifnb, split.by = 'stim')) ifnb <- RunUMAP(ifnb, reduction = 'mnn', dims = 1:30) ifnb <- FindNeighbors(ifnb, reduction = 'mnn', dims = 1:30) ifnb <- FindClusters(ifnb) DimPlot(ifnb, group.by = c('stim', 'ident', 'seurat_annotations'), ncol = 3) ``` ```{r cleanup_ifnb, include=FALSE} rm(ifnb) gc(verbose = FALSE) ``` ### Eight human pancreatic islet datasets To learn more about this dataset, type `?panc8` ```{r pancreas, results='hide'} InstallData("panc8") data("panc8") panc8 <- NormalizeData(panc8) panc8 <- FindVariableFeatures(panc8) panc8 <- RunFastMNN(object.list = SplitObject(panc8, split.by = 'replicate')[c("celseq", "celseq2", "fluidigmc1", "smartseq2")]) panc8 <- RunUMAP(panc8, reduction = 'mnn', dims = 1:30) panc8 <- FindNeighbors(panc8, reduction = 'mnn', dims = 1:30) panc8 <- FindClusters(panc8) DimPlot(panc8, group.by = c('replicate', 'ident', 'celltype'), ncol = 3) ``` ================================================ FILE: docs/fast_mnn.html ================================================ Running fastMNN on Seurat Objects

This vignette demonstrates how to run fastMNN on Seurat objects. Parameters and commands are based off of the fastMNN help page. If you use fastMNN, please cite:

Batch effects in single-cell RNA-sequencing data are corrected by matching mutual nearest neighbors

Laleh Haghverdi, Aaron T L Lun, Michael D Morgan & John C Marioni

Nature Biotechnology, 2018

doi: 10.1038/nbt.4091

Bioconductor: https://bioconductor.org/packages/release/bioc/html/batchelor.html

Prerequisites to install:

library(Seurat)
library(SeuratData)
library(SeuratWrappers)

Systematic comparative analysis of human PBMC

To learn more about this dataset, type ?pbmcsca

InstallData("pbmcsca")
data("pbmcsca")
pbmcsca <- NormalizeData(pbmcsca)
pbmcsca <- FindVariableFeatures(pbmcsca)
pbmcsca <- RunFastMNN(object.list = SplitObject(pbmcsca, split.by = "Method"))
pbmcsca <- RunUMAP(pbmcsca, reduction = "mnn", dims = 1:30)
pbmcsca <- FindNeighbors(pbmcsca, reduction = "mnn", dims = 1:30)
pbmcsca <- FindClusters(pbmcsca)
DimPlot(pbmcsca, group.by = c("Method", "ident", "CellType"), ncol = 3)

Interferon-stimulated and control PBMC

To learn more about this dataset, type ?ifnb

InstallData("ifnb")
data("ifnb")
ifnb <- NormalizeData(ifnb)
ifnb <- FindVariableFeatures(ifnb)
ifnb <- RunFastMNN(object.list = SplitObject(ifnb, split.by = "stim"))
ifnb <- RunUMAP(ifnb, reduction = "mnn", dims = 1:30)
ifnb <- FindNeighbors(ifnb, reduction = "mnn", dims = 1:30)
ifnb <- FindClusters(ifnb)
DimPlot(ifnb, group.by = c("stim", "ident", "seurat_annotations"), ncol = 3)

Eight human pancreatic islet datasets

To learn more about this dataset, type ?panc8

InstallData("panc8")
data("panc8")
panc8 <- NormalizeData(panc8)
panc8 <- FindVariableFeatures(panc8)
panc8 <- RunFastMNN(object.list = SplitObject(panc8, split.by = "replicate")[c("celseq", "celseq2", 
    "fluidigmc1", "smartseq2")])
panc8 <- RunUMAP(panc8, reduction = "mnn", dims = 1:30)
panc8 <- FindNeighbors(panc8, reduction = "mnn", dims = 1:30)
panc8 <- FindClusters(panc8)
DimPlot(panc8, group.by = c("replicate", "ident", "celltype"), ncol = 3)

================================================ FILE: docs/fast_mnn.md ================================================ Running fastMNN on Seurat Objects ================ Compiled: June 23, 2021 - [](#section) - [Systematic comparative analysis of human PBMC](#systematic-comparative-analysis-of-human-pbmc) - [Interferon-stimulated and control PBMC](#interferon-stimulated-and-control-pbmc) - [Eight human pancreatic islet datasets](#eight-human-pancreatic-islet-datasets) This vigettte demonstrates how to run fastMNN on Seurat objects. Parameters and commands are based off of the [fastMNN help page](https://rdrr.io/github/LTLA/batchelor/man/fastMNN.html). If you use fastMNN, please cite: > *Batch effects in single-cell RNA-sequencing data are corrected by > matching mutual nearest neighbors* > > Laleh Haghverdi, Aaron T L Lun, Michael D Morgan & John C Marioni > > Nature Biotechnology, 2018 > > doi: [10.1038/nbt.4091](https://doi.org/10.1038/nbt.4091) > > Bioconductor: > Prerequisites to install: - [Seurat](https://satijalab.org/seurat/install) - [SeuratWrappers](https://github.com/satijalab/seurat-wrappers) - [SeuratData](https://github.com/satijalab/seurat-data) ``` r library(Seurat) library(SeuratData) library(SeuratWrappers) ``` ## ### Systematic comparative analysis of human PBMC To learn more about this dataset, type `?pbmcsca` ``` r InstallData("pbmcsca") data("pbmcsca") pbmcsca <- NormalizeData(pbmcsca) pbmcsca <- FindVariableFeatures(pbmcsca) pbmcsca <- RunFastMNN(object.list = SplitObject(pbmcsca, split.by = "Method")) pbmcsca <- RunUMAP(pbmcsca, reduction = "mnn", dims = 1:30) pbmcsca <- FindNeighbors(pbmcsca, reduction = "mnn", dims = 1:30) pbmcsca <- FindClusters(pbmcsca) DimPlot(pbmcsca, group.by = c("Method", "ident", "CellType"), ncol = 3) ``` ![](/__w/seurat-wrappers/seurat-wrappers/test-build/fast_mnn_files/figure-gfm/pbmcsca-1.png) ### Interferon-stimulated and control PBMC To learn more about this dataset, type `?ifnb` ``` r InstallData("ifnb") 
data("ifnb") ifnb <- NormalizeData(ifnb) ifnb <- FindVariableFeatures(ifnb) ifnb <- RunFastMNN(object.list = SplitObject(ifnb, split.by = "stim")) ifnb <- RunUMAP(ifnb, reduction = "mnn", dims = 1:30) ifnb <- FindNeighbors(ifnb, reduction = "mnn", dims = 1:30) ifnb <- FindClusters(ifnb) DimPlot(ifnb, group.by = c("stim", "ident", "seurat_annotations"), ncol = 3) ``` ![](/__w/seurat-wrappers/seurat-wrappers/test-build/fast_mnn_files/figure-gfm/ifnb_stim-1.png) ### Eight human pancreatic islet datasets To learn more about this dataset, type `?panc8` ``` r InstallData("panc8") data("panc8") panc8 <- NormalizeData(panc8) panc8 <- FindVariableFeatures(panc8) panc8 <- RunFastMNN(object.list = SplitObject(panc8, split.by = "replicate")[c("celseq", "celseq2", "fluidigmc1", "smartseq2")]) panc8 <- RunUMAP(panc8, reduction = "mnn", dims = 1:30) panc8 <- FindNeighbors(panc8, reduction = "mnn", dims = 1:30) panc8 <- FindClusters(panc8) DimPlot(panc8, group.by = c("replicate", "ident", "celltype"), ncol = 3) ``` ![](/__w/seurat-wrappers/seurat-wrappers/test-build/fast_mnn_files/figure-gfm/pancreas-1.png) ================================================ FILE: docs/glmpca.Rmd ================================================ --- title: "Running GLM-PCA on a Seurat Object" date: 'Compiled: `r format(Sys.Date(), "%B %d, %Y")`' output: github_document: html_preview: false toc: false html_document: df_print: kable theme: united --- This vignette demonstrates how to run GLM-PCA, which implements a generalized version of PCA for non-normally distributed data, on a Seurat object. If you use this, please cite: > *Feature selection and dimension reduction for single-cell RNA-Seq based on a multinomial model* > > F. William Townes, Stephanie C. Hicks, Martin J. Aryee & Rafael A. 
Irizarry > > Genome Biology, 2019 > > doi: https://doi.org/10.1186/s13059-019-1861-6 > > GitHub: https://github.com/willtownes/glmpca > CRAN: https://cran.r-project.org/web/packages/glmpca/index.html ```{r setup, include=FALSE} knitr::opts_chunk$set( message = FALSE, warning = FALSE, fig.width = 10 ) ``` Prerequisites to install: * [Seurat](https://satijalab.org/seurat/install) * [SeuratWrappers](https://github.com/satijalab/seurat-wrappers) * [SeuratData](https://github.com/satijalab/seurat-data) * [glmpca](https://github.com/willtownes/glmpca) * [scry](https://github.com/kstreet13/scry) ```{r packages} library(Seurat) library(SeuratData) library(SeuratWrappers) library(glmpca) library(scry) ``` ### GLM-PCA on PBMC3k To learn more about this dataset, type `?pbmc3k` ```{r glmpca, cache=TRUE, cache.lazy=TRUE} InstallData("pbmc3k") data("pbmc3k") # Initial processing to select variable features m <- GetAssayData(pbmc3k, slot = "counts", assay = "RNA") devs <- scry::devianceFeatureSelection(m) dev_ranked_genes <- rownames(pbmc3k)[order(devs, decreasing = TRUE)] topdev <- head(dev_ranked_genes, 2000) # run GLM-PCA on Seurat object. # Uses Poisson model by default # Note that data in the counts slot is used # We choose 10 dimensions for computational efficiency ndims <- 10 pbmc3k <- RunGLMPCA(pbmc3k, features = topdev, L = ndims) pbmc3k <- FindNeighbors(pbmc3k, reduction = 'glmpca', dims = 1:ndims, verbose = FALSE) pbmc3k <- FindClusters(pbmc3k, verbose = FALSE) pbmc3k <- RunUMAP(pbmc3k, reduction = 'glmpca', dims = 1:ndims, verbose = FALSE) ``` ```{r explore, fig.width=6} # visualize markers features.plot <- c('CD3D', 'MS4A1', 'CD8A', 'GZMK', 'GZMB', 'FCGR3A') DimPlot(pbmc3k) ``` Do the learned clusters overlap with the original annotation? 
```{r} with(pbmc3k[[]], table(seurat_annotations, seurat_clusters)) ``` ```{r explore2, fig.height=10} pbmc3k <- NormalizeData(pbmc3k, verbose = FALSE) FeaturePlot(pbmc3k, features.plot, ncol = 2) ``` ================================================ FILE: docs/glmpca.html ================================================ glmpca.utf8

Running GLM-PCA on a Seurat Object

Compiled: July 15, 2020

This vignette demonstrates how to run GLM-PCA, which implements a generalized version of PCA for non-normally distributed data, on a Seurat object. If you use this, please cite:

Feature selection and dimension reduction for single-cell RNA-Seq based on a multinomial model

F. William Townes, Stephanie C. Hicks, Martin J. Aryee & Rafael A. Irizarry

Genome Biology, 2019

doi: https://doi.org/10.1186/s13059-019-1861-6

GitHub: https://github.com/willtownes/glmpca CRAN: https://cran.r-project.org/web/packages/glmpca/index.html

Prerequisites to install:

library(Seurat)
library(SeuratData)
library(SeuratWrappers)
library(glmpca)
library(scry)

GLM-PCA on PBMC3k

To learn more about this dataset, type ?pbmc3k

InstallData("pbmc3k")
data("pbmc3k")

# Initial processing to select variable features
m <- GetAssayData(pbmc3k, slot = "counts", assay = "RNA")
devs <- scry::devianceFeatureSelection(m)
dev_ranked_genes <- rownames(pbmc3k)[order(devs, decreasing = TRUE)]
topdev <- head(dev_ranked_genes, 2000)

# run GLM-PCA on Seurat object. 
# Uses Poisson model by default
# Note that data in the counts slot is used
# We choose 10 dimensions for computational efficiency

ndims <- 10
pbmc3k <- RunGLMPCA(pbmc3k, features = topdev, L = ndims)
pbmc3k <- FindNeighbors(pbmc3k, reduction = 'glmpca', dims = 1:ndims, verbose = FALSE)
pbmc3k <- FindClusters(pbmc3k, verbose = FALSE)
pbmc3k <- RunUMAP(pbmc3k, reduction = 'glmpca', dims = 1:ndims, verbose = FALSE)
# visualize markers
features.plot <- c('CD3D', 'MS4A1', 'CD8A', 'GZMK', 'GZMB', 'FCGR3A')
DimPlot(pbmc3k)

Do the learned clusters overlap with the original annotation?

with(pbmc3k[[]], table(seurat_annotations, seurat_clusters))
##                   seurat_clusters
## seurat_annotations   0   1   2   3   4   5   6   7   8
##       Naive CD4 T  168 484   0   3  42   0   0   0   0
##       Memory CD4 T 405  45   0   0  30   0   0   0   3
##       CD14+ Mono     0   0 469   0   0   8   0   3   0
##       B              0   0   0 344   0   0   0   0   0
##       CD8 T          7   0   0   0 254   0   9   0   1
##       FCGR3A+ Mono   0   0  12   0   0 150   0   0   0
##       NK             0   0   0   0   8   0 147   0   0
##       DC             0   0   2   2   0   0   1  27   0
##       Platelet       0   0   1   0   0   0   0   0  13
pbmc3k <- NormalizeData(pbmc3k, verbose = FALSE) 
FeaturePlot(pbmc3k, features.plot, ncol = 2)

================================================ FILE: docs/glmpca.md ================================================ Running GLM-PCA on a Seurat Object ================ Compiled: July 15, 2020 This vignette demonstrates how to run GLM-PCA, which implements a generalized version of PCA for non-normally distributed data, on a Seurat object. If you use this, please cite: > *Feature selection and dimension reduction for single-cell RNA-Seq > based on a multinomial model* > > F. William Townes, Stephanie C. Hicks, Martin J. Aryee & Rafael A. > Irizarry > > Genome Biology, 2019 > > doi: > > GitHub: CRAN: > Prerequisites to install: - [Seurat](https://satijalab.org/seurat/install) - [SeuratWrappers](https://github.com/satijalab/seurat-wrappers) - [SeuratData](https://github.com/satijalab/seurat-data) - [glmpca](https://github.com/willtownes/glmpca) - [scry](https://github.com/kstreet13/scry) ``` r library(Seurat) library(SeuratData) library(SeuratWrappers) library(glmpca) library(scry) ``` ### GLM-PCA on PBMC3k To learn more about this dataset, type `?pbmc3k` ``` r InstallData("pbmc3k") data("pbmc3k") # Initial processing to select variable features m <- GetAssayData(pbmc3k, slot = "counts", assay = "RNA") devs <- scry::devianceFeatureSelection(m) dev_ranked_genes <- rownames(pbmc3k)[order(devs, decreasing = TRUE)] topdev <- head(dev_ranked_genes, 2000) # run GLM-PCA on Seurat object. 
# Uses Poisson model by default # Note that data in the counts slot is used # We choose 10 dimensions for computational efficiency ndims <- 10 pbmc3k <- RunGLMPCA(pbmc3k, features = topdev, L = ndims) pbmc3k <- FindNeighbors(pbmc3k, reduction = 'glmpca', dims = 1:ndims, verbose = FALSE) pbmc3k <- FindClusters(pbmc3k, verbose = FALSE) pbmc3k <- RunUMAP(pbmc3k, reduction = 'glmpca', dims = 1:ndims, verbose = FALSE) ``` ``` r # visualize markers features.plot <- c('CD3D', 'MS4A1', 'CD8A', 'GZMK', 'GZMB', 'FCGR3A') DimPlot(pbmc3k) ``` ![](glmpca_files/figure-markdown_github/explore-1.png) Do the learned clusters overlap with the original annotation? ``` r with(pbmc3k[[]], table(seurat_annotations, seurat_clusters)) ``` ## seurat_clusters ## seurat_annotations 0 1 2 3 4 5 6 7 8 ## Naive CD4 T 168 484 0 3 42 0 0 0 0 ## Memory CD4 T 405 45 0 0 30 0 0 0 3 ## CD14+ Mono 0 0 469 0 0 8 0 3 0 ## B 0 0 0 344 0 0 0 0 0 ## CD8 T 7 0 0 0 254 0 9 0 1 ## FCGR3A+ Mono 0 0 12 0 0 150 0 0 0 ## NK 0 0 0 0 8 0 147 0 0 ## DC 0 0 2 2 0 0 1 27 0 ## Platelet 0 0 1 0 0 0 0 0 13 ``` r pbmc3k <- NormalizeData(pbmc3k, verbose = FALSE) FeaturePlot(pbmc3k, features.plot, ncol = 2) ``` ![](glmpca_files/figure-markdown_github/explore2-1.png) ================================================ FILE: docs/harmony.Rmd ================================================ --- title: "Integration of datasets using Harmony" date: 'Compiled: `r format(Sys.Date(), "%B %d, %Y")`' output: github_document: html_preview: true toc: true toc_depth: 3 fig_width: 16 html_document: df_print: kable theme: united fig_height: 5 fig_width: 16 out_height: 4 --- This vigettte demonstrates the use of the Harmony package in Seurat. Commands and parameters are based off of the [Harmony use page](https://github.com/immunogenomics/harmony). 
If you use Harmony in your work, please cite: > *Fast, sensitive, and flexible integration of single cell data with Harmony* > > Ilya Korsunsky, Jean Fan, Kamil Slowikowski, Fan Zhang, Kevin Wei, Yuriy Baglaenko, Michael Brenner, Po-Ru Loh, Soumya Raychaudhuri > > bioRxiv, 2019 > > doi: [10.1101/461954v2](https://www.biorxiv.org/content/10.1101/461954v2) > > GitHub: https://github.com/immunogenomics/harmony ```{r setup, include=FALSE} knitr::opts_chunk$set( tidy = TRUE, tidy.opts = list(width.cutoff = 95), message = FALSE, warning = FALSE ) ``` Prerequisites to install: * [Seurat](https://satijalab.org/seurat/install) * [Harmony](https://github.com/immunogenomics/harmony) * [SeuratData](https://github.com/satijalab/seurat-data) Note that SeuratWrappers is not necessary, as the wrapper functions were generously provided by the Harmony authors, and are included when installing Harmony. ```{r packages} library(harmony) library(Seurat) library(SeuratData) ``` ## {.tabset .tabset-pills} ### Systematic comparative analysis of human PBMC To learn more about this dataset, type `?pbmcsca` ```{r pbmcsca, results='hide', cache=TRUE} InstallData("pbmcsca") data("pbmcsca") pbmcsca <- NormalizeData(pbmcsca) %>% FindVariableFeatures() %>% ScaleData() %>% RunPCA(verbose = FALSE) pbmcsca <- RunHarmony(pbmcsca, group.by.vars = 'Method') pbmcsca <- RunUMAP(pbmcsca, reduction = 'harmony', dims = 1:30) pbmcsca <- FindNeighbors(pbmcsca, reduction = 'harmony', dims = 1:30) %>% FindClusters() DimPlot(pbmcsca, group.by = c('Method', 'ident',"CellType"), ncol = 3) ``` ### Interferon-stimulated and control PBMC To learn more about this dataset, type `?ifnb` ```{r ifnb_stim, results='hide', cache=TRUE} InstallData("ifnb") data("ifnb") ifnb <- NormalizeData(ifnb) %>% FindVariableFeatures() %>% ScaleData() %>% RunPCA(verbose = FALSE) ifnb <- RunHarmony(ifnb, group.by.vars = 'stim') ifnb <- RunUMAP(ifnb, reduction = 'harmony', dims = 1:30) ifnb <- FindNeighbors(ifnb, reduction = 'harmony', dims 
= 1:30) %>% FindClusters() DimPlot(ifnb, group.by = c("stim", "ident", "seurat_annotations"), ncol = 3) ``` ### Eight human pancreatic islet datasets To learn more about this dataset, type `?panc8` ```{r pancreas, results='hide', cache=TRUE} InstallData("panc8") data("panc8") panc8 <- NormalizeData(panc8) %>% FindVariableFeatures() %>% ScaleData() %>% RunPCA(verbose = FALSE) panc8 <- RunHarmony(panc8, group.by.vars = 'replicate') panc8 <- RunUMAP(panc8, reduction = 'harmony', dims = 1:30) panc8 <- FindNeighbors(panc8, reduction = 'harmony', dims = 1:30) %>% FindClusters() DimPlot(panc8, group.by = c("replicate", "ident","celltype"), ncol = 3) ``` ================================================ FILE: docs/harmony.html ================================================ Integration of datasets using Harmony

This vignette demonstrates the use of the Harmony package in Seurat. Commands and parameters are based off of the Harmony use page. If you use Harmony in your work, please cite:

Fast, sensitive, and flexible integration of single cell data with Harmony

Ilya Korsunsky, Jean Fan, Kamil Slowikowski, Fan Zhang, Kevin Wei, Yuriy Baglaenko, Michael Brenner, Po-Ru Loh, Soumya Raychaudhuri

bioRxiv, 2019

doi: 10.1101/461954v2

GitHub: https://github.com/immunogenomics/harmony

Prerequisites to install:

Note that SeuratWrappers is not necessary, as the wrapper functions were generously provided by the Harmony authors, and are included when installing Harmony.

library(harmony)
library(Seurat)
library(SeuratData)

Systematic comparative analysis of human PBMC

To learn more about this dataset, type ?pbmcsca

InstallData("pbmcsca")
data("pbmcsca")
pbmcsca <- NormalizeData(pbmcsca) %>% FindVariableFeatures() %>% ScaleData() %>% RunPCA(verbose = FALSE)
pbmcsca <- RunHarmony(pbmcsca, group.by.vars = "Method")
pbmcsca <- RunUMAP(pbmcsca, reduction = "harmony", dims = 1:30)
pbmcsca <- FindNeighbors(pbmcsca, reduction = "harmony", dims = 1:30) %>% FindClusters()
DimPlot(pbmcsca, group.by = c("Method", "ident", "CellType"), ncol = 3)

Interferon-stimulated and control PBMC

To learn more about this dataset, type ?ifnb

InstallData("ifnb")
data("ifnb")
ifnb <- NormalizeData(ifnb) %>% FindVariableFeatures() %>% ScaleData() %>% RunPCA(verbose = FALSE)
ifnb <- RunHarmony(ifnb, group.by.vars = "stim")
ifnb <- RunUMAP(ifnb, reduction = "harmony", dims = 1:30)
ifnb <- FindNeighbors(ifnb, reduction = "harmony", dims = 1:30) %>% FindClusters()
DimPlot(ifnb, group.by = c("stim", "ident", "seurat_annotations"), ncol = 3)

Eight human pancreatic islet datasets

To learn more about this dataset, type ?panc8

InstallData("panc8")
data("panc8")
panc8 <- NormalizeData(panc8) %>% FindVariableFeatures() %>% ScaleData() %>% RunPCA(verbose = FALSE)
panc8 <- RunHarmony(panc8, group.by.vars = "replicate")
panc8 <- RunUMAP(panc8, reduction = "harmony", dims = 1:30)
panc8 <- FindNeighbors(panc8, reduction = "harmony", dims = 1:30) %>% FindClusters()
DimPlot(panc8, group.by = c("replicate", "ident", "celltype"), ncol = 3)

================================================ FILE: docs/harmony.md ================================================ Integration of datasets using Harmony ================ Compiled: July 15, 2019 - [](#section) - [Systematic comparative analysis of human PBMC](#systematic-comparative-analysis-of-human-pbmc) - [Interferon-stimulated and control PBMC](#interferon-stimulated-and-control-pbmc) - [Eight human pancreatic islet datasets](#eight-human-pancreatic-islet-datasets) This vigettte demonstrates the use of the Harmony package in Seurat. Commands and parameters are based off of the [Harmony use page](https://github.com/immunogenomics/harmony). If you use Harmony in your work, please cite: > *Fast, sensitive, and flexible integration of single cell data with Harmony* > > Ilya Korsunsky, Jean Fan, Kamil Slowikowski, Fan Zhang, Kevin Wei, Yuriy Baglaenko, Michael Brenner, Po-Ru Loh, Soumya Raychaudhuri > > bioRxiv, 2019 > > doi: [10.1101/461954v2](https://www.biorxiv.org/content/10.1101/461954v2) > > GitHub: Prerequisites to install: - [Seurat](https://satijalab.org/seurat/install) - [Harmony](https://github.com/immunogenomics/harmony) - [SeuratData](https://github.com/satijalab/seurat-data) Note that SeuratWrappers is not necessary, as the wrapper functions were generously provided by the Harmony authors, and are included when installing Harmony. 
``` r library(harmony) library(Seurat) library(SeuratData) ``` ### Systematic comparative analysis of human PBMC To learn more about this dataset, type `?pbmcsca` ``` r InstallData("pbmcsca") data("pbmcsca") pbmcsca <- NormalizeData(pbmcsca) %>% FindVariableFeatures() %>% ScaleData() %>% RunPCA(verbose = FALSE) pbmcsca <- RunHarmony(pbmcsca, group.by.vars = "Method") pbmcsca <- RunUMAP(pbmcsca, reduction = "harmony", dims = 1:30) pbmcsca <- FindNeighbors(pbmcsca, reduction = "harmony", dims = 1:30) %>% FindClusters() DimPlot(pbmcsca, group.by = c("Method", "ident", "CellType"), ncol = 3) ``` ![](harmony_files/figure-markdown_github/pbmcsca-1.png) ### Interferon-stimulated and control PBMC To learn more about this dataset, type `?ifnb` ``` r InstallData("ifnb") data("ifnb") ifnb <- NormalizeData(ifnb) %>% FindVariableFeatures() %>% ScaleData() %>% RunPCA(verbose = FALSE) ifnb <- RunHarmony(ifnb, group.by.vars = "stim") ifnb <- RunUMAP(ifnb, reduction = "harmony", dims = 1:30) ifnb <- FindNeighbors(ifnb, reduction = "harmony", dims = 1:30) %>% FindClusters() DimPlot(ifnb, group.by = c("stim", "ident", "seurat_annotations"), ncol = 3) ``` ![](harmony_files/figure-markdown_github/ifnb_stim-1.png) ### Eight human pancreatic islet datasets To learn more about this dataset, type `?panc8` ``` r InstallData("panc8") data("panc8") panc8 <- NormalizeData(panc8) %>% FindVariableFeatures() %>% ScaleData() %>% RunPCA(verbose = FALSE) panc8 <- RunHarmony(panc8, group.by.vars = "replicate") panc8 <- RunUMAP(panc8, reduction = "harmony", dims = 1:30) panc8 <- FindNeighbors(panc8, reduction = "harmony", dims = 1:30) %>% FindClusters() DimPlot(panc8, group.by = c("replicate", "ident", "celltype"), ncol = 3) ``` ![](harmony_files/figure-markdown_github/pancreas-1.png) ================================================ FILE: docs/liger.Rmd ================================================ --- title: "Integrating Seurat objects using LIGER" date: 'Compiled: `r format(Sys.Date(), "%B %d, 
%Y")`' output: github_document: html_preview: true toc: true toc_depth: 3 fig_width: 16 html_document: df_print: kable theme: united fig_height: 5 fig_width: 16 out_height: 4 --- NOTE: Please update your `liger` version to 0.5.0 or above before following this tutorial. This vigettte demonstrates how to run LIGER on Seurat objects. Parameters and commands are based on the [LIGER tutorial](http://htmlpreview.github.io/?https://github.com/MacoskoLab/liger/blob/master/vignettes/Integrating_multi_scRNA_data.html). If you use LIGER, please cite: > *Single-Cell Multi-omic Integration Compares and Contrasts Features of Brain Cell Identity* > > Joshua Welch, Velina Kozareva, Ashley Ferreira, Charles Vanderburg, Carly Martin, Evan Z.Macosko > > Cell, 2019. > > doi: [10.1016/j.cell.2019.05.006](https://doi.org/10.1016/j.cell.2019.05.006) > > GitHub: https://github.com/MacoskoLab/liger ```{r setup, include=FALSE} knitr::opts_chunk$set( tidy = TRUE, tidy.opts = list(width.cutoff = 95), message = FALSE, warning = FALSE ) ``` Prerequisites to install: * [Seurat](https://satijalab.org/seurat/install) * [LIGER](https://github.com/MacoskoLab/liger) * [SeuratWrappers](https://github.com/satijalab/seurat-wrappers) * [SeuratData](https://github.com/satijalab/seurat-data) ```{r check, include=FALSE} tryCatch( expr = SeuratWrappers:::CheckPackage("rliger", "cran"), error = { install.packages("rliger") } ) ``` ```{r packages} library(rliger) library(Seurat) library(SeuratData) library(SeuratWrappers) ``` In order to replicate LIGER's multi-dataset functionality, we will use the `split.by` parameter to preprocess the Seurat object on subsets of the data belonging to each dataset separately. Also, as LIGER does not center data when scaling, we will skip that step as well. `RunQuantileNorm` produces joint clusters, but users can also optionally perform Louvain community detection (`FindNeighbors` and `FindClusters`) on the integrated latent space from iNMF. 
## {.tabset .tabset-pills} ### Systematic comparative analysis of human PBMC To learn more about this dataset, type `?pbmcsca` ```{r pbmcsca, results='hide', fig.height = 6, fig.width = 18} InstallData("pbmcsca") data("pbmcsca") # Please update your `liger` version to 0.5.0 or above before following this tutorial pbmcsca <- NormalizeData(pbmcsca) pbmcsca <- FindVariableFeatures(pbmcsca) pbmcsca <- ScaleData(pbmcsca, split.by = 'Method', do.center = FALSE) pbmcsca <- RunOptimizeALS(pbmcsca, k = 20, lambda = 5, split.by = 'Method') pbmcsca <- RunQuantileNorm(pbmcsca, split.by = 'Method') # You can optionally perform Louvain clustering (`FindNeighbors` and `FindClusters`) after `RunQuantileNorm` according to your needs pbmcsca <- FindNeighbors(pbmcsca, reduction = 'iNMF', dims = 1:20) pbmcsca <- FindClusters(pbmcsca, resolution = 0.3) # Dimensional reduction and plotting pbmcsca <- RunUMAP(pbmcsca, dims = 1:ncol(pbmcsca[['iNMF']]), reduction = 'iNMF') DimPlot(pbmcsca, group.by = c('Method', 'ident', 'CellType'), ncol = 3) ``` ### Interferon-stimulated and control PBMC To learn more about this dataset, type `?ifnb` ```{r ifnb, results='hide', fig.height = 6, fig.width = 18} InstallData("ifnb") data("ifnb") # Please update your `liger` version to 0.5.0 or above before following this tutorial. 
ifnb <- NormalizeData(ifnb) ifnb <- FindVariableFeatures(ifnb) ifnb <- ScaleData(ifnb, split.by = 'stim', do.center = FALSE) ifnb <- RunOptimizeALS(ifnb, k = 20, lambda = 5, split.by = 'stim') ifnb <- RunQuantileNorm(ifnb, split.by = 'stim') # You can optionally perform Louvain clustering (`FindNeighbors` and `FindClusters`) after `RunQuantileNorm` according to your needs ifnb <- FindNeighbors(ifnb, reduction = 'iNMF', dims = 1:20) ifnb <- FindClusters(ifnb, resolution = 0.55) # Dimensional reduction and plotting ifnb <- RunUMAP(ifnb, dims = 1:ncol(ifnb[['iNMF']]), reduction = 'iNMF') DimPlot(ifnb, group.by = c('stim', 'ident', 'seurat_annotations'), ncol = 3) ``` ### Eight human pancreatic islet datasets To learn more about this dataset, type `?panc8` ```{r pancreas, results='hide', fig.height = 6, fig.width = 18} InstallData("panc8") data("panc8") # Please update your `liger` version to 0.5.0 or above before following this tutorial. panc8 <- NormalizeData(panc8) panc8 <- FindVariableFeatures(panc8) panc8 <- ScaleData(panc8, split.by = 'replicate', do.center = FALSE) panc8 <- RunOptimizeALS(panc8, k = 20, lambda = 5, split.by = 'replicate') panc8 <- RunQuantileNorm(panc8, split.by = 'replicate') # You can optionally perform Louvain clustering (`FindNeighbors` and `FindClusters`) after `RunQuantileNorm` according to your needs panc8 <- FindNeighbors(panc8, reduction = 'iNMF', dims = 1:20) panc8 <- FindClusters(panc8, resolution = 0.4) # Dimensional reduction and plotting panc8 <- RunUMAP(panc8, dims = 1:ncol(panc8[['iNMF']]), reduction = 'iNMF') DimPlot(panc8, group.by = c('replicate', 'ident', 'celltype'), ncol = 3) ``` ================================================ FILE: docs/liger.html ================================================ Integrating Seurat objects using LIGER

NOTE: Please update your liger version to 0.5.0 or above before following this tutorial.

This vignette demonstrates how to run LIGER on Seurat objects. Parameters and commands are based on the LIGER tutorial. If you use LIGER, please cite:

Single-Cell Multi-omic Integration Compares and Contrasts Features of Brain Cell Identity

Joshua Welch, Velina Kozareva, Ashley Ferreira, Charles Vanderburg, Carly Martin, Evan Z. Macosko

Cell, 2019.

doi: 10.1016/j.cell.2019.05.006

GitHub: https://github.com/MacoskoLab/liger

Prerequisites to install:

library(rliger)
library(Seurat)
library(SeuratData)
library(SeuratWrappers)

In order to replicate LIGER’s multi-dataset functionality, we will use the split.by parameter to preprocess the Seurat object on subsets of the data belonging to each dataset separately. Also, as LIGER does not center data when scaling, we will skip that step as well.

RunQuantileNorm produces joint clusters, but users can also optionally perform Louvain community detection (FindNeighbors and FindClusters) on the integrated latent space from iNMF.

Systematic comparative analysis of human PBMC

To learn more about this dataset, type ?pbmcsca

InstallData("pbmcsca")
data("pbmcsca")
# Please update your `liger` version to 0.5.0 or above before following this tutorial
pbmcsca <- NormalizeData(pbmcsca)
pbmcsca <- FindVariableFeatures(pbmcsca)
pbmcsca <- ScaleData(pbmcsca, split.by = "Method", do.center = FALSE)
pbmcsca <- RunOptimizeALS(pbmcsca, k = 20, lambda = 5, split.by = "Method")
pbmcsca <- RunQuantileNorm(pbmcsca, split.by = "Method")
# You can optionally perform Louvain clustering (`FindNeighbors` and `FindClusters`) after
# `RunQuantileNorm` according to your needs
pbmcsca <- FindNeighbors(pbmcsca, reduction = "iNMF", dims = 1:20)
pbmcsca <- FindClusters(pbmcsca, resolution = 0.3)
# Dimensional reduction and plotting
pbmcsca <- RunUMAP(pbmcsca, dims = 1:ncol(pbmcsca[["iNMF"]]), reduction = "iNMF")
DimPlot(pbmcsca, group.by = c("Method", "ident", "CellType"), ncol = 3)

Interferon-stimulated and control PBMC

To learn more about this dataset, type ?ifnb

InstallData("ifnb")
data("ifnb")
# Please update your `liger` version to 0.5.0 or above before following this tutorial.
ifnb <- NormalizeData(ifnb)
ifnb <- FindVariableFeatures(ifnb)
ifnb <- ScaleData(ifnb, split.by = "stim", do.center = FALSE)
ifnb <- RunOptimizeALS(ifnb, k = 20, lambda = 5, split.by = "stim")
ifnb <- RunQuantileNorm(ifnb, split.by = "stim")
# You can optionally perform Louvain clustering (`FindNeighbors` and `FindClusters`) after
# `RunQuantileNorm` according to your needs
ifnb <- FindNeighbors(ifnb, reduction = "iNMF", dims = 1:20)
ifnb <- FindClusters(ifnb, resolution = 0.55)
# Dimensional reduction and plotting
ifnb <- RunUMAP(ifnb, dims = 1:ncol(ifnb[["iNMF"]]), reduction = "iNMF")
DimPlot(ifnb, group.by = c("stim", "ident", "seurat_annotations"), ncol = 3)

Eight human pancreatic islet datasets

To learn more about this dataset, type ?panc8

InstallData("panc8")
data("panc8")
# Please update your `liger` version to 0.5.0 or above before following this tutorial.
panc8 <- NormalizeData(panc8)
panc8 <- FindVariableFeatures(panc8)
panc8 <- ScaleData(panc8, split.by = "replicate", do.center = FALSE)
panc8 <- RunOptimizeALS(panc8, k = 20, lambda = 5, split.by = "replicate")
panc8 <- RunQuantileNorm(panc8, split.by = "replicate")
# You can optionally perform Louvain clustering (`FindNeighbors` and `FindClusters`) after
# `RunQuantileNorm` according to your needs
panc8 <- FindNeighbors(panc8, reduction = "iNMF", dims = 1:20)
panc8 <- FindClusters(panc8, resolution = 0.4)
# Dimensional reduction and plotting
panc8 <- RunUMAP(panc8, dims = 1:ncol(panc8[["iNMF"]]), reduction = "iNMF")
DimPlot(panc8, group.by = c("replicate", "ident", "celltype"), ncol = 3)

================================================ FILE: docs/liger.md ================================================ Integrating Seurat objects using LIGER ================ Compiled: May 25, 2021 - [](#section) - [Systematic comparative analysis of human PBMC](#systematic-comparative-analysis-of-human-pbmc) - [Interferon-stimulated and control PBMC](#interferon-stimulated-and-control-pbmc) - [Eight human pancreatic islet datasets](#eight-human-pancreatic-islet-datasets) NOTE: Please update your `liger` version to 0.5.0 or above before following this tutorial. This vigettte demonstrates how to run LIGER on Seurat objects. Parameters and commands are based on the [LIGER tutorial](http://htmlpreview.github.io/?https://github.com/MacoskoLab/liger/blob/master/vignettes/Integrating_multi_scRNA_data.html). If you use LIGER, please cite: > *Single-Cell Multi-omic Integration Compares and Contrasts Features of > Brain Cell Identity* > > Joshua Welch, Velina Kozareva, Ashley Ferreira, Charles Vanderburg, > Carly Martin, Evan Z.Macosko > > Cell, 2019. > > doi: > [10.1016/j.cell.2019.05.006](https://doi.org/10.1016/j.cell.2019.05.006) > > GitHub: Prerequisites to install: - [Seurat](https://satijalab.org/seurat/install) - [LIGER](https://github.com/MacoskoLab/liger) - [SeuratWrappers](https://github.com/satijalab/seurat-wrappers) - [SeuratData](https://github.com/satijalab/seurat-data) ``` r library(rliger) library(Seurat) library(SeuratData) library(SeuratWrappers) ``` In order to replicate LIGER’s multi-dataset functionality, we will use the `split.by` parameter to preprocess the Seurat object on subsets of the data belonging to each dataset separately. Also, as LIGER does not center data when scaling, we will skip that step as well. `RunQuantileNorm` produces joint clusters, but users can also optionally perform Louvain community detection (`FindNeighbors` and `FindClusters`) on the integrated latent space from iNMF. 
## ### Systematic comparative analysis of human PBMC To learn more about this dataset, type `?pbmcsca` ``` r InstallData("pbmcsca") data("pbmcsca") # Please update your `liger` version to 0.5.0 or above before following this tutorial pbmcsca <- NormalizeData(pbmcsca) pbmcsca <- FindVariableFeatures(pbmcsca) pbmcsca <- ScaleData(pbmcsca, split.by = "Method", do.center = FALSE) pbmcsca <- RunOptimizeALS(pbmcsca, k = 20, lambda = 5, split.by = "Method") pbmcsca <- RunQuantileNorm(pbmcsca, split.by = "Method") # You can optionally perform Louvain clustering (`FindNeighbors` and `FindClusters`) after # `RunQuantileNorm` according to your needs pbmcsca <- FindNeighbors(pbmcsca, reduction = "iNMF", dims = 1:20) pbmcsca <- FindClusters(pbmcsca, resolution = 0.3) # Dimensional reduction and plotting pbmcsca <- RunUMAP(pbmcsca, dims = 1:ncol(pbmcsca[["iNMF"]]), reduction = "iNMF") DimPlot(pbmcsca, group.by = c("Method", "ident", "CellType"), ncol = 3) ``` ![](/__w/seurat-wrappers/seurat-wrappers/test-build/liger_files/figure-gfm/pbmcsca-1.png) ### Interferon-stimulated and control PBMC To learn more about this dataset, type `?ifnb` ``` r InstallData("ifnb") data("ifnb") # Please update your `liger` version to 0.5.0 or above before following this tutorial. 
ifnb <- NormalizeData(ifnb) ifnb <- FindVariableFeatures(ifnb) ifnb <- ScaleData(ifnb, split.by = "stim", do.center = FALSE) ifnb <- RunOptimizeALS(ifnb, k = 20, lambda = 5, split.by = "stim") ifnb <- RunQuantileNorm(ifnb, split.by = "stim") # You can optionally perform Louvain clustering (`FindNeighbors` and `FindClusters`) after # `RunQuantileNorm` according to your needs ifnb <- FindNeighbors(ifnb, reduction = "iNMF", dims = 1:20) ifnb <- FindClusters(ifnb, resolution = 0.55) # Dimensional reduction and plotting ifnb <- RunUMAP(ifnb, dims = 1:ncol(ifnb[["iNMF"]]), reduction = "iNMF") DimPlot(ifnb, group.by = c("stim", "ident", "seurat_annotations"), ncol = 3) ``` ![](/__w/seurat-wrappers/seurat-wrappers/test-build/liger_files/figure-gfm/ifnb-1.png) ### Eight human pancreatic islet datasets To learn more about this dataset, type `?panc8` ``` r InstallData("panc8") data("panc8") # Please update your `liger` version to 0.5.0 or above before following this tutorial. panc8 <- NormalizeData(panc8) panc8 <- FindVariableFeatures(panc8) panc8 <- ScaleData(panc8, split.by = "replicate", do.center = FALSE) panc8 <- RunOptimizeALS(panc8, k = 20, lambda = 5, split.by = "replicate") panc8 <- RunQuantileNorm(panc8, split.by = "replicate") # You can optionally perform Louvain clustering (`FindNeighbors` and `FindClusters`) after # `RunQuantileNorm` according to your needs panc8 <- FindNeighbors(panc8, reduction = "iNMF", dims = 1:20) panc8 <- FindClusters(panc8, resolution = 0.4) # Dimensional reduction and plotting panc8 <- RunUMAP(panc8, dims = 1:ncol(panc8[["iNMF"]]), reduction = "iNMF") DimPlot(panc8, group.by = c("replicate", "ident", "celltype"), ncol = 3) ``` ![](/__w/seurat-wrappers/seurat-wrappers/test-build/liger_files/figure-gfm/pancreas-1.png) ================================================ FILE: docs/miQC.Rmd ================================================ --- title: "Running miQC on Seurat objects" date: "Compiled: `r format(Sys.time(), '%B %d, %Y')`" output: 
html_document: df_print: kable theme: united github_document: html_preview: no toc: no --- This vigettte demonstrates the use of the miQC package in Seurat. Vignette is based off of the [miQC vignette](https://github.com/greenelab/miQC). If you use miQC in your work, please cite: > *miQC: An adaptive probabilistic framework for quality control of single-cell RNA-sequencing data* > > Ariel A. Hippen, Matias M. Falco, Lukas M. Weber, Erdogan Pekcan Erkan, Kaiyang Zhang, Jennifer Anne Doherty, Anna Vähärautio, Casey S. Greene, Stephanie C. Hicks > > bioRxiv, 2021 > > doi: [10.1101/2021.03.03.433798](https://www.biorxiv.org/content/10.1101/2021.03.03.433798v1) > > GitHub: https://github.com/greenelab/miQC ```{r options, include=FALSE, message=FALSE, warning=FALSE} knitr::opts_chunk$set( tidy = TRUE, tidy.opts = list(width.cutoff = 95), message = FALSE, warning = FALSE ) remotes::install_github("greenelab/miQC") ``` Prerequisites to install: * [Seurat](https://satijalab.org/seurat/install) * [SeuratData](https://github.com/satijalab/seurat-data) * [flexmix](https://cran.r-project.org/web/packages/flexmix/index.html) which is wrapped by the [miQC](https://github.com/greenelab/miQC) package. * _At this point, the miQC algorithm has been adapted for use in Seurat through installation of flexmix only_. ```{r packages} library(Seurat) library(SeuratData) library(SeuratWrappers) library(flexmix) ``` ## Introduction This vignette provides a basic example of how to run miQC, which allows users to perform cell-wise filtering of single-cell RNA-seq data for quality control. Single-cell RNA-seq data is very sensitive to tissue quality and choice of experimental workflow; it’s critical to ensure compromised cells and failed cell libraries are removed. 
A high proportion of reads mapping to mitochondrial DNA is one sign of a damaged cell, so most analyses will remove cells with mtRNA over a certain threshold, but those thresholds can be arbitrary and/or detrimentally stringent, especially for archived tumor tissues. miQC jointly models both the proportion of reads mapping to mtDNA genes and the number of detected genes with mixture models in a probabilistic framework to identify the low-quality cells in a given dataset. ## Example data To demonstrate how to run miQC on a single-cell RNA-seq dataset, we'll use the `pbmc3k`dataset from the SeuratData package. ```{r load pbmc3k} InstallData("pbmc3k") data("pbmc3k") pbmc3k ``` ## Seurat preprocessing _miQC_ requires two QC metrics for each single cell dataset: (1) the number of unique genes detected per cell and (2) the percent mitochondrial reads. The number of unique genes detected per cell are typically calculated and stored automatically as metadata (*nFeature_RNA*) upon creation of a Seurat object with `CreateSeuratObject`. In order to calculate the percent mitochondrial reads in a cell we can use `PercentageFeatureSet`. Human mitochondrial genes start with _MT-_ (and _mt-_ for murine genes). For other IDs, we recommend using a _biomaRt_ query to map to chromosomal location and identify all mitochondrial genes. We add this as metadata here to the Seurat object as `"percent.mt"`. ```{r} pbmc3k[["percent.mt"]] <- PercentageFeatureSet(object = pbmc3k, pattern = "^MT-") ``` ## miQC We can visually inspect the `"percent.mt"` and `"nFeature_RNA"` values in the `pbmc3k` dataset. ``` {r} FeatureScatter(pbmc3k, feature1 = "nFeature_RNA", feature2 = "percent.mt") ``` We can see that most cells have a fairly low proportion of mitochondrial reads, given that the graph is much denser at the bottom. We likely have many cells that are intact and biologically meaningful. 
There are also a few cells that have almost half of their reads mapping to mitochondrial genes, which are likely broken or otherwise compromised and we will want to exclude from our downstream analysis. However, it's not clear what boundaries to draw to separate the two groups of cells. With that in mind, we'll generate a linear mixture model using the `RunMiQC` function. The linear mixture model will be stored in the `misc` slot of the Seurat object as `"flexmix_model"`. ```{r} pbmc3k <- RunMiQC(pbmc3k, percent.mt = "percent.mt", nFeature_RNA = "nFeature_RNA", posterior.cutoff = 0.75, model.slot = "flexmix_model") ``` This function is a wrapper for _flexmix_, which fits a mixture model on our data and returns the parameters of the two lines that best fit the data, as well as the posterior probability of each cell being derived from each distribution. We can look at the parameters and posterior values directly with the functions ``` {r} flexmix::parameters(Misc(pbmc3k, "flexmix_model")) head(flexmix::posterior(Misc(pbmc3k, "flexmix_model"))) ``` Or we can visualize the model results using the _PlotMiQC_ function, where `"miQC.probability"` represents the posterior probability of the cell belonging to the compromised condition: ```{r} PlotMiQC(pbmc3k, color.by = "miQC.probability")+ ggplot2::scale_color_gradient(low = "grey", high = "purple") ``` As expected, the cells at the very top of the graph are almost certainly compromised, most likely to have been derived from the distribution with fewer unique genes and higher baseline mitochondrial expression. We can use these posterior probabilities to choose which cells to keep, and visualize the consequences of this filtering with the _PlotMiQC_ function. Recall when running `"RunMiQC"` we set the `"posterior.cutoff"` to be 0.75. 
```{r} PlotMiQC(pbmc3k, color.by = "miQC.keep") ``` To actually perform the filtering and remove the indicated cells from our Seurat object, we can subset the Seurat object parameter as such: ```{r} pbmc3k_filtered <- subset(pbmc3k, miQC.keep == "keep") pbmc3k_filtered ``` ## Extras In most cases, a linear mixture model will be satisfactory as well as simplest, but _RunMiQC_ also supports some non-linear mixture models: currently polynomials and b-splines. A user should only need to change the _model.type_ parameter when making the model, and all visualization and filtering functions will work the same as with a linear model. ```{r} pbmc3k <- RunMiQC(pbmc3k, percent.mt = "percent.mt", nFeature_RNA = "nFeature_RNA", posterior.cutoff = 0.75, model.slot = "flexmix_model", model.type = "spline") PlotMiQC(pbmc3k, color.by = "miQC.keep") ``` Also, _RunMiQC_ defaults to removing any cell with 75% or greater posterior probability of being compromised, but if we want to be more or less stringent, we can alter the _posterior.cutoff_ parameter, like so: ```{r} pbmc3k <- RunMiQC(pbmc3k, percent.mt = "percent.mt", nFeature_RNA = "nFeature_RNA", posterior.cutoff = 0.9, model.slot = "flexmix_model") PlotMiQC(pbmc3k, color.by = "miQC.keep") ``` Note that when performing miQC multiple times on different samples for the same experiment, it's recommended to select the same _posterior_cutoff_ for all, to give consistency in addition to the flexibility of sample-specific models. ## When not to use miQC The miQC model is based on the assumption that there are a non-trivial number of compromised cells in the dataset, which is not true in all datasets. We recommend using _FeatureScatter_ on a dataset before running miQC to see if the two-distribution model is appropriate. Look for the distinctive triangular shape where cells have a wide variety of mitochondrial percentages at lower gene counts and taper off to lower mitochondrial percentage at higher gene counts. 
For example of a dataset where there's not a significant number of compromised cells, so the two-distribution assumption is not met, we simulate an extreme case using the `"pbmc3k"` dataset here. ```{r} set.seed(2021) pbmc3k_extreme <- pbmc3k simulated_percent_mt <- rnorm(mean = 2.5, sd = 0.2, n = ncol(pbmc3k_extreme)) pbmc3k_extreme$percent.mt <- ifelse(pbmc3k_extreme$nFeature_RNA > 400, simulated_percent_mt, pbmc3k_extreme$percent.mt) simulated_percent_mt_2 <- runif(min = 0, max = 60, n = ncol(pbmc3k_extreme)) pbmc3k_extreme$percent.mt <- ifelse(pbmc3k_extreme$nFeature_RNA < 400, simulated_percent_mt_2, pbmc3k_extreme$percent.mt) FeatureScatter(pbmc3k_extreme, feature1 = "nFeature_RNA", feature2 = "percent.mt") ``` The _RunMiQC_ function will throw a warning if only one distribution is found. In these cases, we recommend using other filtering methods, such as a cutoff on mitochondrial percentage or percentile using the `"backup.option"` parameter to one of `"c("percentile", "percent", "pass", "halt")`. ```{r, warning=TRUE,message=TRUE} pbmc3k_extreme <- RunMiQC(pbmc3k_extreme, percent.mt = "percent.mt", nFeature_RNA = "nFeature_RNA", posterior.cutoff = 0.9, model.slot = "flexmix_model", backup.option = "percentile", backup.percentile = 0.95) FeatureScatter(pbmc3k_extreme, feature1 = "nFeature_RNA", feature2 = "percent.mt", group.by = "miQC.keep") ``` # Session Information ```{r, echo=FALSE} ## Session info options(width = 120) sessionInfo() ``` ================================================ FILE: docs/miQC.html ================================================ Running miQC on Seurat objects

This vignette demonstrates the use of the miQC package in Seurat. The vignette is based on the miQC vignette. If you use miQC in your work, please cite:

miQC: An adaptive probabilistic framework for quality control of single-cell RNA-sequencing data

Ariel A. Hippen, Matias M. Falco, Lukas M. Weber, Erdogan Pekcan Erkan, Kaiyang Zhang, Jennifer Anne Doherty, Anna Vähärautio, Casey S. Greene, Stephanie C. Hicks

bioRxiv, 2021

doi: 10.1101/2021.03.03.433798

GitHub: https://github.com/greenelab/miQC

Prerequisites to install:

  • Seurat
  • SeuratData
  • flexmix which is wrapped by the miQC package.
    • At this point, the miQC algorithm has been adapted for use in Seurat through installation of flexmix only.
library(Seurat)
library(SeuratData)
library(SeuratWrappers)
library(flexmix)

Introduction

This vignette provides a basic example of how to run miQC, which allows users to perform cell-wise filtering of single-cell RNA-seq data for quality control. Single-cell RNA-seq data is very sensitive to tissue quality and choice of experimental workflow; it’s critical to ensure compromised cells and failed cell libraries are removed. A high proportion of reads mapping to mitochondrial DNA is one sign of a damaged cell, so most analyses will remove cells with mtRNA over a certain threshold, but those thresholds can be arbitrary and/or detrimentally stringent, especially for archived tumor tissues. miQC jointly models both the proportion of reads mapping to mtDNA genes and the number of detected genes with mixture models in a probabilistic framework to identify the low-quality cells in a given dataset.

Example data

To demonstrate how to run miQC on a single-cell RNA-seq dataset, we’ll use the pbmc3k dataset from the SeuratData package.

InstallData("pbmc3k")
data("pbmc3k")
pbmc3k
## An object of class Seurat 
## 13714 features across 2700 samples within 1 assay 
## Active assay: RNA (13714 features, 0 variable features)

Seurat preprocessing

miQC requires two QC metrics for each single cell dataset: (1) the number of unique genes detected per cell and (2) the percent mitochondrial reads. The number of unique genes detected per cell are typically calculated and stored automatically as metadata (nFeature_RNA) upon creation of a Seurat object with CreateSeuratObject.

In order to calculate the percent mitochondrial reads in a cell we can use PercentageFeatureSet. Human mitochondrial genes start with MT- (and mt- for murine genes). For other IDs, we recommend using a biomaRt query to map to chromosomal location and identify all mitochondrial genes. We add this as metadata here to the Seurat object as "percent.mt".

pbmc3k[["percent.mt"]] <- PercentageFeatureSet(object = pbmc3k, pattern = "^MT-")

miQC

We can visually inspect the "percent.mt" and "nFeature_RNA" values in the pbmc3k dataset.

FeatureScatter(pbmc3k, feature1 = "nFeature_RNA", feature2 = "percent.mt")

We can see that most cells have a fairly low proportion of mitochondrial reads, given that the graph is much denser at the bottom. We likely have many cells that are intact and biologically meaningful. There are also a few cells that have almost half of their reads mapping to mitochondrial genes, which are likely broken or otherwise compromised and we will want to exclude from our downstream analysis. However, it’s not clear what boundaries to draw to separate the two groups of cells. With that in mind, we’ll generate a linear mixture model using the RunMiQC function. The linear mixture model will be stored in the misc slot of the Seurat object as "flexmix_model".

pbmc3k <- RunMiQC(pbmc3k, percent.mt = "percent.mt", nFeature_RNA = "nFeature_RNA", posterior.cutoff = 0.75, 
    model.slot = "flexmix_model")

This function is a wrapper for flexmix, which fits a mixture model on our data and returns the parameters of the two lines that best fit the data, as well as the posterior probability of each cell being derived from each distribution.

We can look at the parameters and posterior values directly with the functions

flexmix::parameters(Misc(pbmc3k, "flexmix_model"))
##                         Comp.1       Comp.2
## coef.(Intercept)  2.004939e+00  7.141952783
## coef.nFeature_RNA 3.222184e-05 -0.004138082
## sigma             7.409008e-01  2.121678523
head(flexmix::posterior(Misc(pbmc3k, "flexmix_model")))
##           [,1]       [,2]
## [1,] 0.9287557 0.07124429
## [2,] 0.7600390 0.23996098
## [3,] 0.9195142 0.08048576
## [4,] 0.9710883 0.02891168
## [5,] 0.9873697 0.01263027
## [6,] 0.9782177 0.02178231

Or we can visualize the model results using the PlotMiQC function, where "miQC.probability" represents the posterior probability of the cell belonging to the compromised condition:

PlotMiQC(pbmc3k, color.by = "miQC.probability") + ggplot2::scale_color_gradient(low = "grey", high = "purple")

As expected, the cells at the very top of the graph are almost certainly compromised, most likely to have been derived from the distribution with fewer unique genes and higher baseline mitochondrial expression.

We can use these posterior probabilities to choose which cells to keep, and visualize the consequences of this filtering with the PlotMiQC function. Recall when running "RunMiQC" we set the "posterior.cutoff" to be 0.75.

PlotMiQC(pbmc3k, color.by = "miQC.keep")

To actually perform the filtering and remove the indicated cells from our Seurat object, we can subset the Seurat object parameter as such:

pbmc3k_filtered <- subset(pbmc3k, miQC.keep == "keep")
pbmc3k_filtered
## An object of class Seurat 
## 13714 features across 2593 samples within 1 assay 
## Active assay: RNA (13714 features, 0 variable features)

Extras

In most cases, a linear mixture model will be satisfactory as well as simplest, but RunMiQC also supports some non-linear mixture models: currently polynomials and b-splines. A user should only need to change the model.type parameter when making the model, and all visualization and filtering functions will work the same as with a linear model.

pbmc3k <- RunMiQC(pbmc3k, percent.mt = "percent.mt", nFeature_RNA = "nFeature_RNA", posterior.cutoff = 0.75, 
    model.slot = "flexmix_model", model.type = "spline")
PlotMiQC(pbmc3k, color.by = "miQC.keep")

Also, RunMiQC defaults to removing any cell with 75% or greater posterior probability of being compromised, but if we want to be more or less stringent, we can alter the posterior.cutoff parameter, like so:

pbmc3k <- RunMiQC(pbmc3k, percent.mt = "percent.mt", nFeature_RNA = "nFeature_RNA", posterior.cutoff = 0.9, 
    model.slot = "flexmix_model")
PlotMiQC(pbmc3k, color.by = "miQC.keep")

Note that when performing miQC multiple times on different samples for the same experiment, it’s recommended to select the same posterior.cutoff for all, to give consistency in addition to the flexibility of sample-specific models.

When not to use miQC

The miQC model is based on the assumption that there are a non-trivial number of compromised cells in the dataset, which is not true in all datasets. We recommend using FeatureScatter on a dataset before running miQC to see if the two-distribution model is appropriate. Look for the distinctive triangular shape where cells have a wide variety of mitochondrial percentages at lower gene counts and taper off to lower mitochondrial percentage at higher gene counts.

For an example of a dataset where there’s not a significant number of compromised cells, so the two-distribution assumption is not met, we simulate an extreme case using the "pbmc3k" dataset here.

set.seed(2021)
pbmc3k_extreme <- pbmc3k
simulated_percent_mt <- rnorm(mean = 2.5, sd = 0.2, n = ncol(pbmc3k_extreme))
pbmc3k_extreme$percent.mt <- ifelse(pbmc3k_extreme$nFeature_RNA > 400, simulated_percent_mt, pbmc3k_extreme$percent.mt)
simulated_percent_mt_2 <- runif(min = 0, max = 60, n = ncol(pbmc3k_extreme))
pbmc3k_extreme$percent.mt <- ifelse(pbmc3k_extreme$nFeature_RNA < 400, simulated_percent_mt_2, pbmc3k_extreme$percent.mt)
FeatureScatter(pbmc3k_extreme, feature1 = "nFeature_RNA", feature2 = "percent.mt")

The RunMiQC function will throw a warning if only one distribution is found. In these cases, we recommend using other filtering methods, such as a cutoff on mitochondrial percentage or percentile, by setting the "backup.option" parameter to one of c("percentile", "percent", "pass", "halt").

pbmc3k_extreme <- RunMiQC(pbmc3k_extreme, percent.mt = "percent.mt", nFeature_RNA = "nFeature_RNA", 
    posterior.cutoff = 0.9, model.slot = "flexmix_model", backup.option = "percentile", backup.percentile = 0.95)
## Warning in RunMiQC(pbmc3k_extreme, percent.mt = "percent.mt", nFeature_RNA =
## "nFeature_RNA", : flexmix returned only 1 cluster
## defaulting to backup.percentile for filtering
## Warning: Adding a command log without an assay associated with it
FeatureScatter(pbmc3k_extreme, feature1 = "nFeature_RNA", feature2 = "percent.mt", group.by = "miQC.keep")

Session Information

## R version 4.0.4 (2021-02-15)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04 LTS
## 
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.8.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C               LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=C              LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C             LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] pbmc3k.SeuratData_3.1.4 flexmix_2.3-17          lattice_0.20-41         SeuratWrappers_0.3.0   
## [5] SeuratData_0.2.1        SeuratObject_4.0.1      Seurat_4.0.1           
## 
## loaded via a namespace (and not attached):
##   [1] Rtsne_0.15            colorspace_2.0-2      deldir_0.2-10         modeltools_0.2-23     ellipsis_0.3.2       
##   [6] ggridges_0.5.3        rprojroot_2.0.2       spatstat.data_2.1-0   farver_2.1.0          leiden_0.3.7         
##  [11] listenv_0.8.0         remotes_2.3.0         ggrepel_0.9.1         fansi_0.5.0           R.methodsS3_1.8.1    
##  [16] codetools_0.2-18      splines_4.0.4         knitr_1.33            polyclip_1.10-0       jsonlite_1.7.2       
##  [21] ica_1.0-2             cluster_2.1.0         R.oo_1.24.0           png_0.1-7             uwot_0.1.10          
##  [26] shiny_1.6.0           sctransform_0.3.2     spatstat.sparse_2.0-0 BiocManager_1.30.15   compiler_4.0.4       
##  [31] httr_1.4.2            Matrix_1.3-3          fastmap_1.1.0         lazyeval_0.2.2        cli_3.0.1            
##  [36] later_1.2.0           formatR_1.9           htmltools_0.5.1.1     prettyunits_1.1.1     tools_4.0.4          
##  [41] rsvd_1.0.5            igraph_1.2.6          gtable_0.3.0          glue_1.4.2            RANN_2.6.1           
##  [46] reshape2_1.4.4        dplyr_1.0.6           rappdirs_0.3.3        Rcpp_1.0.6            scattermore_0.7      
##  [51] jquerylib_0.1.4       vctrs_0.3.8           nlme_3.1-152          lmtest_0.9-38         xfun_0.23            
##  [56] stringr_1.4.0         globals_0.14.0        ps_1.6.0              mime_0.10             miniUI_0.1.1.1       
##  [61] lifecycle_1.0.0       irlba_2.3.3           goftest_1.2-2         future_1.21.0         MASS_7.3-53          
##  [66] zoo_1.8-9             scales_1.1.1          spatstat.core_2.1-2   promises_1.2.0.1      spatstat.utils_2.1-0 
##  [71] parallel_4.0.4        RColorBrewer_1.1-2    yaml_2.2.1            curl_4.3.1            reticulate_1.20      
##  [76] pbapply_1.4-3         gridExtra_2.3         ggplot2_3.3.5         sass_0.4.0            rpart_4.1-15         
##  [81] stringi_1.6.2         highr_0.9             pkgbuild_1.2.0        rlang_0.4.11          pkgconfig_2.0.3      
##  [86] matrixStats_0.59.0    evaluate_0.14         tensor_1.5            ROCR_1.0-11           purrr_0.3.4          
##  [91] labeling_0.4.2        patchwork_1.1.1       htmlwidgets_1.5.3     cowplot_1.1.1         processx_3.5.2       
##  [96] tidyselect_1.1.1      parallelly_1.25.0     RcppAnnoy_0.0.18      plyr_1.8.6            magrittr_2.0.1       
## [101] R6_2.5.0              generics_0.1.0        mgcv_1.8-33           pillar_1.6.1          withr_2.4.2          
## [106] fitdistrplus_1.1-3    nnet_7.3-15           abind_1.4-5           survival_3.2-7        tibble_3.1.2         
## [111] future.apply_1.7.0    crayon_1.4.1          KernSmooth_2.23-18    utf8_1.2.1            spatstat.geom_2.1-0  
## [116] plotly_4.9.3          rmarkdown_2.8         grid_4.0.4            data.table_1.14.0     callr_3.7.0          
## [121] digest_0.6.27         xtable_1.8-4          tidyr_1.1.3           httpuv_1.6.1          R.utils_2.10.1       
## [126] stats4_4.0.4          munsell_0.5.0         viridisLite_0.4.0     bslib_0.2.5.1
================================================ FILE: docs/miQC.md ================================================ Running miQC on Seurat objects ================ Compiled: July 19, 2021 This vigettte demonstrates the use of the miQC package in Seurat. Vignette is based off of the [miQC vignette](https://github.com/greenelab/miQC). If you use miQC in your work, please cite: > *miQC: An adaptive probabilistic framework for quality control of > single-cell RNA-sequencing data* > > Ariel A. Hippen, Matias M. Falco, Lukas M. Weber, Erdogan Pekcan > Erkan, Kaiyang Zhang, Jennifer Anne Doherty, Anna Vähärautio, Casey S. > Greene, Stephanie C. Hicks > > bioRxiv, 2021 > > doi: > [10.1101/2021.03.03.433798](https://www.biorxiv.org/content/10.1101/2021.03.03.433798v1) > > GitHub: Prerequisites to install: - [Seurat](https://satijalab.org/seurat/install) - [SeuratData](https://github.com/satijalab/seurat-data) - [flexmix](https://cran.r-project.org/web/packages/flexmix/index.html) which is wrapped by the [miQC](https://github.com/greenelab/miQC) package. - *At this point, the miQC algorithm has been adapted for use in Seurat through installation of flexmix only*. ``` r library(Seurat) library(SeuratData) library(SeuratWrappers) library(flexmix) ``` ## Introduction This vignette provides a basic example of how to run miQC, which allows users to perform cell-wise filtering of single-cell RNA-seq data for quality control. Single-cell RNA-seq data is very sensitive to tissue quality and choice of experimental workflow; it’s critical to ensure compromised cells and failed cell libraries are removed. A high proportion of reads mapping to mitochondrial DNA is one sign of a damaged cell, so most analyses will remove cells with mtRNA over a certain threshold, but those thresholds can be arbitrary and/or detrimentally stringent, especially for archived tumor tissues. 
miQC jointly models both the proportion of reads mapping to mtDNA genes and the number of detected genes with mixture models in a probabilistic framework to identify the low-quality cells in a given dataset. ## Example data To demonstrate how to run miQC on a single-cell RNA-seq dataset, we’ll use the `pbmc3k`dataset from the SeuratData package. ``` r InstallData("pbmc3k") data("pbmc3k") pbmc3k ``` ## An object of class Seurat ## 13714 features across 2700 samples within 1 assay ## Active assay: RNA (13714 features, 0 variable features) ## Seurat preprocessing *miQC* requires two QC metrics for each single cell dataset: (1) the number of unique genes detected per cell and (2) the percent mitochondrial reads. The number of unique genes detected per cell are typically calculated and stored automatically as metadata (*nFeature\_RNA*) upon creation of a Seurat object with `CreateSeuratObject`. In order to calculate the percent mitochondrial reads in a cell we can use `PercentageFeatureSet`. Human mitochondrial genes start with *MT-* (and *mt-* for murine genes). For other IDs, we recommend using a *biomaRt* query to map to chromosomal location and identify all mitochondrial genes. We add this as metadata here to the Seurat object as `"percent.mt"`. ``` r pbmc3k[["percent.mt"]] <- PercentageFeatureSet(object = pbmc3k, pattern = "^MT-") ``` ## miQC We can visually inspect the `"percent.mt"` and `"nFeature_RNA"` values in the `pbmc3k` dataset. ``` r FeatureScatter(pbmc3k, feature1 = "nFeature_RNA", feature2 = "percent.mt") ``` ![](/__w/seurat-wrappers/seurat-wrappers/test-build/miQC_files/figure-gfm/unnamed-chunk-2-1.png) We can see that most cells have a fairly low proportion of mitochondrial reads, given that the graph is much denser at the bottom. We likely have many cells that are intact and biologically meaningful. 
There are also a few cells that have almost half of their reads mapping to mitochondrial genes, which are likely broken or otherwise compromised and we will want to exclude from our downstream analysis. However, it’s not clear what boundaries to draw to separate the two groups of cells. With that in mind, we’ll generate a linear mixture model using the `RunMiQC` function. The linear mixture model will be stored in the `misc` slot of the Seurat object as `"flexmix_model"`. ``` r pbmc3k <- RunMiQC(pbmc3k, percent.mt = "percent.mt", nFeature_RNA = "nFeature_RNA", posterior.cutoff = 0.75, model.slot = "flexmix_model") ``` This function is a wrapper for *flexmix*, which fits a mixture model on our data and returns the parameters of the two lines that best fit the data, as well as the posterior probability of each cell being derived from each distribution. We can look at the parameters and posterior values directly with the functions ``` r flexmix::parameters(Misc(pbmc3k, "flexmix_model")) ``` ## Comp.1 Comp.2 ## coef.(Intercept) 2.005173e+00 7.144404951 ## coef.nFeature_RNA 3.205993e-05 -0.004140063 ## sigma 7.410023e-01 2.122227100 ``` r head(flexmix::posterior(Misc(pbmc3k, "flexmix_model"))) ``` ## [,1] [,2] ## [1,] 0.9288831 0.07111692 ## [2,] 0.7604008 0.23959917 ## [3,] 0.9196259 0.08037411 ## [4,] 0.9711273 0.02887266 ## [5,] 0.9873905 0.01260947 ## [6,] 0.9782491 0.02175086 Or we can visualize the model results using the *PlotMiQC* function, where `"miQC.probability"` represents the posterior probability of the cell belonging to the compromised condition: ``` r PlotMiQC(pbmc3k, color.by = "miQC.probability") + ggplot2::scale_color_gradient(low = "grey", high = "purple") ``` ![](/__w/seurat-wrappers/seurat-wrappers/test-build/miQC_files/figure-gfm/unnamed-chunk-5-1.png) As expected, the cells at the very top of the graph are almost certainly compromised, most likely to have been derived from the distribution with fewer unique genes and higher baseline 
mitochondrial expression. We can use these posterior probabilities to choose which cells to keep, and visualize the consequences of this filtering with the *PlotMiQC* function. Recall when running `"RunMiQC"` we set the `"posterior.cutoff"` to be 0.75. ``` r PlotMiQC(pbmc3k, color.by = "miQC.keep") ``` ![](/__w/seurat-wrappers/seurat-wrappers/test-build/miQC_files/figure-gfm/unnamed-chunk-6-1.png) To actually perform the filtering and remove the indicated cells from our Seurat object, we can subset the Seurat object parameter as such: ``` r pbmc3k_filtered <- subset(pbmc3k, miQC.keep == "keep") pbmc3k_filtered ``` ## An object of class Seurat ## 13714 features across 2593 samples within 1 assay ## Active assay: RNA (13714 features, 0 variable features) ## Extras In most cases, a linear mixture model will be satisfactory as well as simplest, but *RunMiQC* also supports some non-linear mixture models: currently polynomials and b-splines. A user should only need to change the *model.type* parameter when making the model, and all visualization and filtering functions will work the same as with a linear model. 
``` r pbmc3k <- RunMiQC(pbmc3k, percent.mt = "percent.mt", nFeature_RNA = "nFeature_RNA", posterior.cutoff = 0.75, model.slot = "flexmix_model", model.type = "spline") PlotMiQC(pbmc3k, color.by = "miQC.keep") ``` ![](/__w/seurat-wrappers/seurat-wrappers/test-build/miQC_files/figure-gfm/unnamed-chunk-8-1.png) Also, *RunMiQC* defaults to removing any cell with 75% or greater posterior probability of being compromised, but if we want to be more or less stringent, we can alter the *posterior.cutoff* parameter, like so: ``` r pbmc3k <- RunMiQC(pbmc3k, percent.mt = "percent.mt", nFeature_RNA = "nFeature_RNA", posterior.cutoff = 0.9, model.slot = "flexmix_model") PlotMiQC(pbmc3k, color.by = "miQC.keep") ``` ![](/__w/seurat-wrappers/seurat-wrappers/test-build/miQC_files/figure-gfm/unnamed-chunk-9-1.png) Note that when performing miQC multiple times on different samples for the same experiment, it’s recommended to select the same *posterior\_cutoff* for all, to give consistency in addition to the flexibility of sample-specific models. ## When not to use miQC The miQC model is based on the assumption that there are a non-trivial number of compromised cells in the dataset, which is not true in all datasets. We recommend using *FeatureScatter* on a dataset before running miQC to see if the two-distribution model is appropriate. Look for the distinctive triangular shape where cells have a wide variety of mitochondrial percentages at lower gene counts and taper off to lower mitochondrial percentage at higher gene counts. For example of a dataset where there’s not a significant number of compromised cells, so the two-distribution assumption is not met, we simulate an extreme case using the `"pbmc3k"` dataset here. 
``` r set.seed(2021) pbmc3k_extreme <- pbmc3k simulated_percent_mt <- rnorm(mean = 2.5, sd = 0.2, n = ncol(pbmc3k_extreme)) pbmc3k_extreme$percent.mt <- ifelse(pbmc3k_extreme$nFeature_RNA > 400, simulated_percent_mt, pbmc3k_extreme$percent.mt) simulated_percent_mt_2 <- runif(min = 0, max = 60, n = ncol(pbmc3k_extreme)) pbmc3k_extreme$percent.mt <- ifelse(pbmc3k_extreme$nFeature_RNA < 400, simulated_percent_mt_2, pbmc3k_extreme$percent.mt) FeatureScatter(pbmc3k_extreme, feature1 = "nFeature_RNA", feature2 = "percent.mt") ``` ![](/__w/seurat-wrappers/seurat-wrappers/test-build/miQC_files/figure-gfm/unnamed-chunk-10-1.png) The *RunMiQC* function will throw a warning if only one distribution is found. In these cases, we recommend using other filtering methods, such as a cutoff on mitochondrial percentage or percentile using the `"backup.option"` parameter to one of `"c("percentile", "percent", "pass", "halt")`. ``` r pbmc3k_extreme <- RunMiQC(pbmc3k_extreme, percent.mt = "percent.mt", nFeature_RNA = "nFeature_RNA", posterior.cutoff = 0.9, model.slot = "flexmix_model", backup.option = "percentile", backup.percentile = 0.95) ``` ## Warning in RunMiQC(pbmc3k_extreme, percent.mt = "percent.mt", nFeature_RNA = "nFeature_RNA", : flexmix returned only 1 ## cluster ## defaulting to backup.percentile for filtering ## Warning: Adding a command log without an assay associated with it ``` r FeatureScatter(pbmc3k_extreme, feature1 = "nFeature_RNA", feature2 = "percent.mt", group.by = "miQC.keep") ``` ![](/__w/seurat-wrappers/seurat-wrappers/test-build/miQC_files/figure-gfm/unnamed-chunk-11-1.png) # Session Information ## R version 4.0.4 (2021-02-15) ## Platform: x86_64-pc-linux-gnu (64-bit) ## Running under: Ubuntu 20.04 LTS ## ## Matrix products: default ## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.8.so ## ## locale: ## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8 ## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=C 
LC_PAPER=en_US.UTF-8 LC_NAME=C ## [9] LC_ADDRESS=C LC_TELEPHONE=C LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C ## ## attached base packages: ## [1] stats graphics grDevices utils datasets methods base ## ## other attached packages: ## [1] pbmc3k.SeuratData_3.1.4 flexmix_2.3-17 lattice_0.20-41 SeuratWrappers_0.3.0 ## [5] SeuratData_0.2.1 SeuratObject_4.0.1 Seurat_4.0.1 ## ## loaded via a namespace (and not attached): ## [1] Rtsne_0.15 colorspace_2.0-2 deldir_0.2-10 modeltools_0.2-23 ellipsis_0.3.2 ## [6] ggridges_0.5.3 rprojroot_2.0.2 spatstat.data_2.1-0 farver_2.1.0 leiden_0.3.7 ## [11] listenv_0.8.0 remotes_2.3.0 ggrepel_0.9.1 fansi_0.5.0 R.methodsS3_1.8.1 ## [16] codetools_0.2-18 splines_4.0.4 knitr_1.33 polyclip_1.10-0 jsonlite_1.7.2 ## [21] ica_1.0-2 cluster_2.1.0 R.oo_1.24.0 png_0.1-7 uwot_0.1.10 ## [26] shiny_1.6.0 sctransform_0.3.2 spatstat.sparse_2.0-0 BiocManager_1.30.15 compiler_4.0.4 ## [31] httr_1.4.2 Matrix_1.3-3 fastmap_1.1.0 lazyeval_0.2.2 cli_3.0.1 ## [36] later_1.2.0 formatR_1.9 htmltools_0.5.1.1 prettyunits_1.1.1 tools_4.0.4 ## [41] rsvd_1.0.5 igraph_1.2.6 gtable_0.3.0 glue_1.4.2 RANN_2.6.1 ## [46] reshape2_1.4.4 dplyr_1.0.6 rappdirs_0.3.3 Rcpp_1.0.6 scattermore_0.7 ## [51] jquerylib_0.1.4 vctrs_0.3.8 nlme_3.1-152 lmtest_0.9-38 xfun_0.23 ## [56] stringr_1.4.0 globals_0.14.0 ps_1.6.0 mime_0.10 miniUI_0.1.1.1 ## [61] lifecycle_1.0.0 irlba_2.3.3 goftest_1.2-2 future_1.21.0 MASS_7.3-53 ## [66] zoo_1.8-9 scales_1.1.1 spatstat.core_2.1-2 promises_1.2.0.1 spatstat.utils_2.1-0 ## [71] parallel_4.0.4 RColorBrewer_1.1-2 yaml_2.2.1 curl_4.3.1 reticulate_1.20 ## [76] pbapply_1.4-3 gridExtra_2.3 ggplot2_3.3.5 sass_0.4.0 rpart_4.1-15 ## [81] stringi_1.6.2 highr_0.9 pkgbuild_1.2.0 rlang_0.4.11 pkgconfig_2.0.3 ## [86] matrixStats_0.59.0 evaluate_0.14 tensor_1.5 ROCR_1.0-11 purrr_0.3.4 ## [91] labeling_0.4.2 patchwork_1.1.1 htmlwidgets_1.5.3 cowplot_1.1.1 processx_3.5.2 ## [96] tidyselect_1.1.1 parallelly_1.25.0 RcppAnnoy_0.0.18 plyr_1.8.6 magrittr_2.0.1 ## 
[101] R6_2.5.0 generics_0.1.0 mgcv_1.8-33 pillar_1.6.1 withr_2.4.2 ## [106] fitdistrplus_1.1-3 nnet_7.3-15 abind_1.4-5 survival_3.2-7 tibble_3.1.2 ## [111] future.apply_1.7.0 crayon_1.4.1 KernSmooth_2.23-18 utf8_1.2.1 spatstat.geom_2.1-0 ## [116] plotly_4.9.3 rmarkdown_2.8 grid_4.0.4 data.table_1.14.0 callr_3.7.0 ## [121] digest_0.6.27 xtable_1.8-4 tidyr_1.1.3 httpuv_1.6.1 R.utils_2.10.1 ## [126] stats4_4.0.4 munsell_0.5.0 viridisLite_0.4.0 bslib_0.2.5.1 ================================================ FILE: docs/monocle3.Rmd ================================================ --- title: "Calculating Trajectories with Monocle 3 and Seurat" output: html_document: df_print: kable theme: united github_document: html_preview: no toc: no date: 'Compiled: `r format(Sys.Date(), "%B %d, %Y")`' --- This vigettte demonstrates how to run trajectory inference and pseudotime calculations with Monocle 3 on Seurat objects. If you use Monocle 3, please cite: > *The single-cell transcriptional landscape of mammalian organogenesis* > > Junyue Cao, Malte Spielmann, Xiaojie Qiu, Xingfan Huang, Daniel M. Ibrahim, Andrew J. Hill, Fan Zhang, Stefan Mundlos, Lena Christiansen, Frank J. 
Steemers, Cole Trapnell & Jay Shendure > > doi: [10.1038/s41586-019-0969-x](https://doi.org/10.1038/s41586-019-0969-x) > > Website: https://cole-trapnell-lab.github.io/monocle3/ ```{r setup, include=FALSE} knitr::opts_chunk$set( tidy = TRUE, tidy.opts = list(width.cutoff = 95), message = FALSE, warning = FALSE, fig.height = 10, fig.width = 16 ) ``` Prerequisites to install: * [Seurat](https://satijalab.org/seurat/install) * [SeuratWrappers](https://github.com/satijalab/seurat-wrappers) * [SeuratData](https://github.com/satijalab/seurat-data) * [Monocle 3](https://cole-trapnell-lab.github.io/monocle3/docs/installation/) ```{r install_deps, echo=FALSE, results="hide"} if (!requireNamespace("monocle3", quietly = TRUE)) { setRepositories(ind = 1:2) remotes::install_github( repo = "cole-trapnell-lab/monocle3", upgrade = FALSE ) } ``` ```{r packages} library(monocle3) library(Seurat) library(SeuratData) library(SeuratWrappers) library(ggplot2) library(patchwork) library(magrittr) ``` ## HCA Bone Marrow 40k ```{r load_hca, eval=FALSE} InstallData("hcabm40k") data("hcabm40k") ``` ```{r preprocess_hca, results='hide', eval=FALSE} hcabm40k <- SplitObject(hcabm40k, split.by = "orig.ident") for (i in seq_along(hcabm40k)) { hcabm40k[[i]] <- NormalizeData(hcabm40k[[i]]) %>% FindVariableFeatures() } features <- SelectIntegrationFeatures(hcabm40k) for (i in seq_along(along.with = hcabm40k)) { hcabm40k[[i]] <- ScaleData(hcabm40k[[i]], features = features) %>% RunPCA(features = features) } ``` ```{r integration, results='hide', eval=FALSE} anchors <- FindIntegrationAnchors( hcabm40k, reference = c(1, 2), reduction = "rpca", dims = 1:30 ) integrated <- IntegrateData(anchors, dims = 1:30) ``` ```{r dimreduc_hca, results='hide', eval=FALSE} integrated <- ScaleData(integrated) integrated <- RunPCA(integrated) integrated <- RunUMAP(integrated, dims = 1:30, reduction.name = "UMAP") integrated <- FindNeighbors(integrated, dims = 1:30) integrated <- FindClusters(integrated) 
DimPlot(integrated, group.by = c("orig.ident", "ident")) ``` ```{r integrated_dimplot, echo=FALSE} con <- url("https://seurat.nygenome.org/monocle3/hcabm40k_integrated.Rds") integrated <- readRDS(file = con) close(con = con) DimPlot(object = integrated, group.by = c("orig.ident", "ident")) ``` ```{r monocle3_clustering, results='hide'} cds <- as.cell_data_set(integrated) cds <- cluster_cells(cds) p1 <- plot_cells(cds, show_trajectory_graph = FALSE) p2 <- plot_cells(cds, color_cells_by = "partition", show_trajectory_graph = FALSE) wrap_plots(p1, p2) ``` ```{r gc6, echo=FALSE, results="hide"} rm(integrated) gc(verbose = FALSE) ``` ```{r subset_partitions, results="hide"} integrated.sub <- subset(as.Seurat(cds), monocle3_partitions == 1) cds <- as.cell_data_set(integrated.sub) cds <- learn_graph(cds) plot_cells( cds, label_groups_by_cluster = FALSE, label_leaves = FALSE, label_branch_points = FALSE ) ``` ```{r gc7, echo=FALSE, results="hide"} gc(verbose = FALSE) ``` ```{r pseudotime, results="hide"} max.avp <- which.max(unlist(FetchData(integrated.sub, "AVP"))) max.avp <- colnames(integrated.sub)[max.avp] cds <- order_cells(cds, root_cells = max.avp) plot_cells( cds, color_cells_by = "pseudotime", label_cell_groups = FALSE, label_leaves = FALSE, label_branch_points = FALSE ) # Set the assay back as "integrated" integrated.sub <- as.Seurat(cds, assay = "integrated") FeaturePlot(integrated.sub, "monocle3_pseudotime") ``` ================================================ FILE: docs/monocle3.html ================================================ Calculating Trajectories with Monocle 3 and Seurat

This vignette demonstrates how to run trajectory inference and pseudotime calculations with Monocle 3 on Seurat objects. If you use Monocle 3, please cite:

The single-cell transcriptional landscape of mammalian organogenesis

Junyue Cao, Malte Spielmann, Xiaojie Qiu, Xingfan Huang, Daniel M. Ibrahim, Andrew J. Hill, Fan Zhang, Stefan Mundlos, Lena Christiansen, Frank J. Steemers, Cole Trapnell & Jay Shendure

doi: 10.1038/s41586-019-0969-x

Website: https://cole-trapnell-lab.github.io/monocle3/

Prerequisites to install:

library(monocle3)
library(Seurat)
library(SeuratData)
library(SeuratWrappers)
library(ggplot2)
library(patchwork)
library(magrittr)

HCA Bone Marrow 40k

InstallData("hcabm40k")
data("hcabm40k")
hcabm40k <- SplitObject(hcabm40k, split.by = "orig.ident")
for (i in seq_along(hcabm40k)) {
    hcabm40k[[i]] <- NormalizeData(hcabm40k[[i]]) %>% FindVariableFeatures()
}
features <- SelectIntegrationFeatures(hcabm40k)
for (i in seq_along(along.with = hcabm40k)) {
    hcabm40k[[i]] <- ScaleData(hcabm40k[[i]], features = features) %>% RunPCA(features = features)
}
anchors <- FindIntegrationAnchors(hcabm40k, reference = c(1, 2), reduction = "rpca", dims = 1:30)
integrated <- IntegrateData(anchors, dims = 1:30)
integrated <- ScaleData(integrated)
integrated <- RunPCA(integrated)
integrated <- RunUMAP(integrated, dims = 1:30, reduction.name = "UMAP")
integrated <- FindNeighbors(integrated, dims = 1:30)
integrated <- FindClusters(integrated)
DimPlot(integrated, group.by = c("orig.ident", "ident"))

cds <- as.cell_data_set(integrated)
cds <- cluster_cells(cds)
p1 <- plot_cells(cds, show_trajectory_graph = FALSE)
p2 <- plot_cells(cds, color_cells_by = "partition", show_trajectory_graph = FALSE)
wrap_plots(p1, p2)

integrated.sub <- subset(as.Seurat(cds), monocle3_partitions == 1)
cds <- as.cell_data_set(integrated.sub)
cds <- learn_graph(cds)
plot_cells(cds, label_groups_by_cluster = FALSE, label_leaves = FALSE, label_branch_points = FALSE)

max.avp <- which.max(unlist(FetchData(integrated.sub, "AVP")))
max.avp <- colnames(integrated.sub)[max.avp]
cds <- order_cells(cds, root_cells = max.avp)
plot_cells(cds, color_cells_by = "pseudotime", label_cell_groups = FALSE, label_leaves = FALSE, 
    label_branch_points = FALSE)

# Set the assay back as 'integrated'
integrated.sub <- as.Seurat(cds, assay = "integrated")
FeaturePlot(integrated.sub, "monocle3_pseudotime")

================================================ FILE: docs/monocle3.md ================================================ Calculating Trajectories with Monocle 3 and Seurat ================ Compiled: June 17, 2020 This vigettte demonstrates how to run trajectory inference and pseudotime calculations with Monocle 3 on Seurat objects. If you use Monocle 3, please cite: > *The single-cell transcriptional landscape of mammalian organogenesis* > > Junyue Cao, Malte Spielmann, Xiaojie Qiu, Xingfan Huang, Daniel M. > Ibrahim, Andrew J. Hill, Fan Zhang, Stefan Mundlos, Lena Christiansen, > Frank J. Steemers, Cole Trapnell & Jay Shendure > > doi: > [10.1038/s41586-019-0969-x](https://doi.org/10.1038/s41586-019-0969-x) > > Website: Prerequisites to install: - [Seurat](https://satijalab.org/seurat/install) - [SeuratWrappers](https://github.com/satijalab/seurat-wrappers) - [SeuratData](https://github.com/satijalab/seurat-data) - [Monocle 3](https://cole-trapnell-lab.github.io/monocle3/docs/installation/) ``` r library(monocle3) library(Seurat) library(SeuratData) library(SeuratWrappers) library(ggplot2) library(patchwork) library(magrittr) ``` ## HCA Bone Marrow 40k ``` r InstallData("hcabm40k") data("hcabm40k") ``` ``` r hcabm40k <- SplitObject(hcabm40k, split.by = "orig.ident") for (i in seq_along(hcabm40k)) { hcabm40k[[i]] <- NormalizeData(hcabm40k[[i]]) %>% FindVariableFeatures() } features <- SelectIntegrationFeatures(hcabm40k) for (i in seq_along(along.with = hcabm40k)) { hcabm40k[[i]] <- ScaleData(hcabm40k[[i]], features = features) %>% RunPCA(features = features) } ``` ``` r anchors <- FindIntegrationAnchors(hcabm40k, reference = c(1, 2), reduction = "rpca", dims = 1:30) integrated <- IntegrateData(anchors, dims = 1:30) ``` ``` r integrated <- ScaleData(integrated) integrated <- RunPCA(integrated) integrated <- RunUMAP(integrated, dims = 1:30, reduction.name = "UMAP") integrated <- FindNeighbors(integrated, dims = 1:30) integrated <- FindClusters(integrated) 
DimPlot(integrated, group.by = c("orig.ident", "ident")) ``` ![](monocle3_files/figure-gfm/integrated_dimplot-1.png) ``` r cds <- as.cell_data_set(integrated) cds <- cluster_cells(cds) p1 <- plot_cells(cds, show_trajectory_graph = FALSE) p2 <- plot_cells(cds, color_cells_by = "partition", show_trajectory_graph = FALSE) wrap_plots(p1, p2) ``` ![](monocle3_files/figure-gfm/monocle3_clustering-1.png) ``` r integrated.sub <- subset(as.Seurat(cds), monocle3_partitions == 1) cds <- as.cell_data_set(integrated.sub) cds <- learn_graph(cds) plot_cells(cds, label_groups_by_cluster = FALSE, label_leaves = FALSE, label_branch_points = FALSE) ``` ![](monocle3_files/figure-gfm/subset_partitions-1.png) ``` r max.avp <- which.max(unlist(FetchData(integrated.sub, "AVP"))) max.avp <- colnames(integrated.sub)[max.avp] cds <- order_cells(cds, root_cells = max.avp) plot_cells(cds, color_cells_by = "pseudotime", label_cell_groups = FALSE, label_leaves = FALSE, label_branch_points = FALSE) ``` ![](monocle3_files/figure-gfm/pseudotime-1.png) ``` r integrated.sub <- as.Seurat(cds) FeaturePlot(integrated.sub, "monocle3_pseudotime") ``` ![](monocle3_files/figure-gfm/pseudotime-2.png) ================================================ FILE: docs/nebulosa.Rmd ================================================ --- title: "Visualization of gene expression with Nebulosa" date: 'Compiled: `r format(Sys.Date(), "%B %d, %Y")`' output: github_document: html_preview: true toc: true toc_depth: 3 html_document: df_print: kable theme: united --- ```{r r setup, include=FALSE} knitr::opts_chunk$set( tidy = TRUE, tidy.opts = list(width.cutoff = 95), message = FALSE, warning = FALSE ) ``` ```{r install_nebulosa, include=FALSE} if (!requireNamespace("Nebulosa", quietly = TRUE)) { remotes::install_github( repo = "powellgenomicslab/Nebulosa", dependencies = TRUE, upgrade = FALSE ) } ``` This vignette demonstrates how to run Nebulosa on a Seurat object. 
If you use this, please cite: > *Nebulosa recovers single cell gene expression signals by kernel density estimation* > > Jose Alquicira-Hernandez and Joseph E. Powell > > (Under review), 2020. > > doi: [10.18129](10.18129/B9.bioc.Nebulosa) > > Website: https://github.com/powellgenomicslab/Nebulosa # Overview Due to the sparsity observed in single-cell data (e.g. RNA-seq, ATAC-seq), the visualization of cell features (e.g. gene, peak) is frequently affected and unclear, especially when it is overlaid with clustering to annotate cell types. `Nebulosa` is an R package to visualize data from single cells based on kernel density estimation. It aims to recover the signal from dropped-out features by incorporating the similarity between cells allowing a "convolution" of the cell features. # Import libraries For this vignette, let's use `Nebulosa` with the `Seurat` package. First, we'll do a brief/standard data processing. ```{r import_libraries} library("Nebulosa") library("Seurat") library("BiocFileCache") ``` # Data pre-processing Let's download a dataset of 3k PBMCs (available from 10X Genomics). This same dataset is commonly used in Seurat vignettes. The code below will download, store, and uncompress the data in a temporary directory. ```{r download_and_untar_file} bfc <- BiocFileCache(ask = FALSE) data_file <- bfcrpath(bfc, file.path( "https://s3-us-west-2.amazonaws.com/10x.files/samples/cell", "pbmc3k", "pbmc3k_filtered_gene_bc_matrices.tar.gz" )) untar(data_file, exdir = tempdir()) ``` Then, we can read the gene expression matrix using the `Read10X` from `Seurat` ```{r read_data} data <- Read10X(data.dir = file.path(tempdir(), "filtered_gene_bc_matrices", "hg19" )) ``` Let's create a Seurat object with features being expressed in at least 3 cells and cells expressing at least 200 genes. 
```{r create_seurat_object} pbmc <- CreateSeuratObject( counts = data, project = "pbmc3k", min.cells = 3, min.features = 200 ) ``` Remove outlier cells based on the number of genes being expressed in each cell (below 2500 genes) and expression of mitochondrial genes (below 5%). ```{r qc} pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^MT-") pbmc <- subset(pbmc, subset = nFeature_RNA < 2500 & percent.mt < 5) ``` ## Data normalization Let's use `SCTransform` to stabilize the variance of the data by regressing out the effect of the sequencing depth from each cell. ```{r norm, message=FALSE, warning=FALSE} pbmc <- SCTransform(pbmc, verbose = FALSE) ``` ## Dimensionality reduction Once the data is normalized and scaled, we can run a _Principal Component Analysis_ (PCA) first to reduce the dimensions of our data from 26286 features to 50 principal components. To visualize the principal components, we can run a _Uniform Manifold Approximation and Projection for Dimension Reduction_ (UMAP) using the first 30 principal components to obtain a two-dimentional space. ```{r dim_red, message=FALSE, warning=FALSE} pbmc <- RunPCA(pbmc) pbmc <- RunUMAP(pbmc, dims = 1:30) ``` ## Clustering To assess cell similarity, let's cluster the data by constructing a _Shared Nearest Neighbor _(SNN) _Graph_ using the first 30 principal components and applying the _Louvain algorithm_. ```{r clustering, message=FALSE, warning=FALSE} pbmc <- FindNeighbors(pbmc, dims = 1:30) pbmc <- FindClusters(pbmc) ``` # Visualize data with `Nebulosa` The main function from `Nebulosa` is the `plot_density`. For usability, it resembles the `FeaturePlot` function from `Seurat`. 
Let's plot the kernel density estimate for `CD4` as follows ```{r plot_cd4} plot_density(pbmc, "CD4") ``` For comparison, let's also plot a standard scatterplot using `Seurat` ```{r cd4_comparison} FeaturePlot(pbmc, "CD4") FeaturePlot(pbmc, "CD4", order = TRUE) ``` By smoothing the data, `Nebulosa` allows a better visualization of the global expression of CD4 in myeloid and CD4+ T cells. Notice that the "random" expression of CD4 in other areas of the plot is removed as the expression of this gene is not supported by many cells in those areas. Furthermore, CD4+ cells appear to show considerable dropout rate. Let's plot the expression of CD4 with `Nebulosa` next to the clustering results ```{r cd4_and_clustering} DimPlot(pbmc, label = TRUE, repel = TRUE) ``` We can now easily identify that clusters `0` and `2` correspond to CD4+ T cells if we plot CD3D too. ```{r plot_cd3d} plot_density(pbmc, "CD3D") ``` # Multi-feature visualization Characterize cell populations usually relies in more than a single marker. Nebulosa allows the visualization of the joint density of from multiple features in a single plot. ## Identifying Naive CD8+ T cells Users familiarized with PBMC datasets may know that CD8+ CCR7+ cells usually cluster next to CD4+ CCR7+ and separate from the rest of CD8+ cells. Let's aim to identify Naive CD8+ T cells. To do so, we can just add another gene to the vector containing the features to visualize. ```{r fig.height=10} p3 <- plot_density(pbmc, c("CD8A", "CCR7")) p3 + plot_layout(ncol = 1) ``` `Nebulosa` can return a *joint density* plot by multiplying the densities from all query genes by using the `joint = TRUE` parameter: ```{r fig.height=14} p4 <- plot_density(pbmc, c("CD8A", "CCR7"), joint = TRUE) p4 + plot_layout(ncol = 1) ``` When compared to the clustering results, we can easily identify that Naive CD8+ T cells correspond to cluster `8`. `Nebulosa` returns the density estimates for each gene along with the joint density across all provided genes. 
By setting `combine = FALSE`, we can obtain a list of ggplot objects where the last plot corresponds to the joint density estimate. ```{r} p_list <- plot_density(pbmc, c("CD8A", "CCR7"), joint = TRUE, combine = FALSE) p_list[[length(p_list)]] ``` ## Identifying Naive CD4+ T cells Likewise, the identification of Naive CD4+ T cells becomes straightforward by combining `CD4` and `CCR7`: ```{r fig.height=14} p4 <- plot_density(pbmc, c("CD4", "CCR7"), joint = TRUE) p4 + plot_layout(ncol = 1) ``` Notice that these cells are mainly constrained to cluster `0` ```{r fig.height=10} p4[[3]] / DimPlot(pbmc, label = TRUE, repel = TRUE) ``` # Conclusions In summary,`Nebulosa`can be useful to recover the signal from dropped-out genes and improve their visualization in a two-dimensional space. We recommend using `Nebulosa` particularly for dropped-out genes. For fairly well-expressed genes, the direct visualization of the gene expression may be preferable. We encourage users to use `Nebulosa` along with the core visualization methods from the `Seurat` and `Bioconductor` environments as well as other visualization methods to draw more informed conclusions about their data. ================================================ FILE: docs/nebulosa.html ================================================ Visualization of gene expression with Nebulosa

This vignette demonstrates how to run Nebulosa on a Seurat object. If you use this, please cite:

Nebulosa recovers single cell gene expression signals by kernel density estimation

Jose Alquicira-Hernandez and Joseph E. Powell

(Under review), 2020.

doi: 10.18129/B9.bioc.Nebulosa

Website: https://github.com/powellgenomicslab/Nebulosa

Overview

Due to the sparsity observed in single-cell data (e.g. RNA-seq, ATAC-seq), the visualization of cell features (e.g. gene, peak) is frequently affected and unclear, especially when it is overlaid with clustering to annotate cell types. Nebulosa is an R package to visualize data from single cells based on kernel density estimation. It aims to recover the signal from dropped-out features by incorporating the similarity between cells allowing a “convolution” of the cell features.

Import libraries

For this vignette, let’s use Nebulosa with the Seurat package. First, we’ll do a brief/standard data processing.

library("Nebulosa")
library("Seurat")
library("BiocFileCache")

Data pre-processing

Let’s download a dataset of 3k PBMCs (available from 10X Genomics). This same dataset is commonly used in Seurat vignettes. The code below will download, store, and uncompress the data in a temporary directory.

bfc <- BiocFileCache(ask = FALSE)
data_file <- bfcrpath(bfc, file.path("https://s3-us-west-2.amazonaws.com/10x.files/samples/cell", 
    "pbmc3k", "pbmc3k_filtered_gene_bc_matrices.tar.gz"))

untar(data_file, exdir = tempdir())

Then, we can read the gene expression matrix using the Read10X from Seurat

data <- Read10X(data.dir = file.path(tempdir(), "filtered_gene_bc_matrices", "hg19"))

Let’s create a Seurat object with features being expressed in at least 3 cells and cells expressing at least 200 genes.

pbmc <- CreateSeuratObject(counts = data, project = "pbmc3k", min.cells = 3, min.features = 200)

Remove outlier cells based on the number of genes being expressed in each cell (below 2500 genes) and expression of mitochondrial genes (below 5%).

pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^MT-")
pbmc <- subset(pbmc, subset = nFeature_RNA < 2500 & percent.mt < 5)

Data normalization

Let’s use SCTransform to stabilize the variance of the data by regressing out the effect of the sequencing depth from each cell.

pbmc <- SCTransform(pbmc, verbose = FALSE)

Dimensionality reduction

Once the data is normalized and scaled, we can run a Principal Component Analysis (PCA) first to reduce the dimensions of our data from 26286 features to 50 principal components. To visualize the principal components, we can run a Uniform Manifold Approximation and Projection for Dimension Reduction (UMAP) using the first 30 principal components to obtain a two-dimensional space.

pbmc <- RunPCA(pbmc)
pbmc <- RunUMAP(pbmc, dims = 1:30)

Clustering

To assess cell similarity, let’s cluster the data by constructing a Shared Nearest Neighbor (SNN) Graph using the first 30 principal components and applying the Louvain algorithm.

pbmc <- FindNeighbors(pbmc, dims = 1:30)
pbmc <- FindClusters(pbmc)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 2638
## Number of edges: 113368
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8272
## Number of communities: 13
## Elapsed time: 0 seconds

Visualize data with Nebulosa

The main function from Nebulosa is the plot_density. For usability, it resembles the FeaturePlot function from Seurat.

Let’s plot the kernel density estimate for CD4 as follows

plot_density(pbmc, "CD4")

For comparison, let’s also plot a standard scatterplot using Seurat

FeaturePlot(pbmc, "CD4")

FeaturePlot(pbmc, "CD4", order = TRUE)

By smoothing the data, Nebulosa allows a better visualization of the global expression of CD4 in myeloid and CD4+ T cells. Notice that the “random” expression of CD4 in other areas of the plot is removed as the expression of this gene is not supported by many cells in those areas. Furthermore, CD4+ cells appear to show a considerable dropout rate.

Let’s plot the expression of CD4 with Nebulosa next to the clustering results

DimPlot(pbmc, label = TRUE, repel = TRUE)

We can now easily identify that clusters 0 and 2 correspond to CD4+ T cells if we plot CD3D too.

plot_density(pbmc, "CD3D")

Multi-feature visualization

Characterizing cell populations usually relies on more than a single marker. Nebulosa allows the visualization of the joint density of multiple features in a single plot.

Identifying Naive CD8+ T cells

Users familiarized with PBMC datasets may know that CD8+ CCR7+ cells usually cluster next to CD4+ CCR7+ and separate from the rest of CD8+ cells. Let’s aim to identify Naive CD8+ T cells. To do so, we can just add another gene to the vector containing the features to visualize.

p3 <- plot_density(pbmc, c("CD8A", "CCR7"))
p3 + plot_layout(ncol = 1)

Nebulosa can return a joint density plot by multiplying the densities from all query genes by using the joint = TRUE parameter:

p4 <- plot_density(pbmc, c("CD8A", "CCR7"), joint = TRUE)
p4 + plot_layout(ncol = 1)

When compared to the clustering results, we can easily identify that Naive CD8+ T cells correspond to cluster 8.

Nebulosa returns the density estimates for each gene along with the joint density across all provided genes. By setting combine = FALSE, we can obtain a list of ggplot objects where the last plot corresponds to the joint density estimate.

p_list <- plot_density(pbmc, c("CD8A", "CCR7"), joint = TRUE, combine = FALSE)
p_list[[length(p_list)]]

Identifying Naive CD4+ T cells

Likewise, the identification of Naive CD4+ T cells becomes straightforward by combining CD4 and CCR7:

p4 <- plot_density(pbmc, c("CD4", "CCR7"), joint = TRUE)
p4 + plot_layout(ncol = 1)

Notice that these cells are mainly constrained to cluster 0

p4[[3]]/DimPlot(pbmc, label = TRUE, repel = TRUE)

Conclusions

In summary, Nebulosa can be useful to recover the signal from dropped-out genes and improve their visualization in a two-dimensional space. We recommend using Nebulosa particularly for dropped-out genes. For fairly well-expressed genes, the direct visualization of the gene expression may be preferable. We encourage users to use Nebulosa along with the core visualization methods from the Seurat and Bioconductor environments as well as other visualization methods to draw more informed conclusions about their data.

================================================ FILE: docs/nebulosa.md ================================================ Visualization of gene expression with Nebulosa ================ Compiled: September 29, 2020 This vignette demonstrates how to run Nebulosa on a Seurat object. If you use this, please cite: > *Nebulosa recovers single cell gene expression signals by kernel > density estimation* > > Jose Alquicira-Hernandez and Joseph E. Powell > > (Under review), 2020. > > doi: [10.18129](10.18129/B9.bioc.Nebulosa) > > Website: # Overview Due to the sparsity observed in single-cell data (e.g. RNA-seq, ATAC-seq), the visualization of cell features (e.g. gene, peak) is frequently affected and unclear, especially when it is overlaid with clustering to annotate cell types. `Nebulosa` is an R package to visualize data from single cells based on kernel density estimation. It aims to recover the signal from dropped-out features by incorporating the similarity between cells allowing a “convolution” of the cell features. # Import libraries For this vignette, let’s use `Nebulosa` with the `Seurat` package. First, we’ll do a brief/standard data processing. ``` r library("Nebulosa") library("Seurat") library("BiocFileCache") ``` # Data pre-processing Let’s download a dataset of 3k PBMCs (available from 10X Genomics). This same dataset is commonly used in Seurat vignettes. The code below will download, store, and uncompress the data in a temporary directory. ``` r bfc <- BiocFileCache(ask = FALSE) data_file <- bfcrpath(bfc, file.path("https://s3-us-west-2.amazonaws.com/10x.files/samples/cell", "pbmc3k", "pbmc3k_filtered_gene_bc_matrices.tar.gz")) untar(data_file, exdir = tempdir()) ``` Then, we can read the gene expression matrix using the `Read10X` from `Seurat` ``` r data <- Read10X(data.dir = file.path(tempdir(), "filtered_gene_bc_matrices", "hg19")) ``` Let’s create a Seurat object with features being expressed in at least 3 cells and cells expressing at least 200 genes. 
``` r pbmc <- CreateSeuratObject(counts = data, project = "pbmc3k", min.cells = 3, min.features = 200) ``` Remove outlier cells based on the number of genes being expressed in each cell (below 2500 genes) and expression of mitochondrial genes (below 5%). ``` r pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^MT-") pbmc <- subset(pbmc, subset = nFeature_RNA < 2500 & percent.mt < 5) ``` ## Data normalization Let’s use `SCTransform` to stabilize the variance of the data by regressing out the effect of the sequencing depth from each cell. ``` r pbmc <- SCTransform(pbmc, verbose = FALSE) ``` ## Dimensionality reduction Once the data is normalized and scaled, we can run a *Principal Component Analysis* (PCA) first to reduce the dimensions of our data from 26286 features to 50 principal components. To visualize the principal components, we can run a *Uniform Manifold Approximation and Projection for Dimension Reduction* (UMAP) using the first 30 principal components to obtain a two-dimentional space. ``` r pbmc <- RunPCA(pbmc) pbmc <- RunUMAP(pbmc, dims = 1:30) ``` ## Clustering To assess cell similarity, let’s cluster the data by constructing a *Shared Nearest Neighbor *(SNN) *Graph* using the first 30 principal components and applying the *Louvain algorithm*. ``` r pbmc <- FindNeighbors(pbmc, dims = 1:30) pbmc <- FindClusters(pbmc) ``` ## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck ## ## Number of nodes: 2638 ## Number of edges: 113368 ## ## Running Louvain algorithm... ## Maximum modularity in 10 random starts: 0.8272 ## Number of communities: 13 ## Elapsed time: 0 seconds # Visualize data with `Nebulosa` The main function from `Nebulosa` is the `plot_density`. For usability, it resembles the `FeaturePlot` function from `Seurat`. 
Let’s plot the kernel density estimate for `CD4` as follows ``` r plot_density(pbmc, "CD4") ``` ![](nebulosa_files/figure-gfm/plot_cd4-1.png) For comparison, let’s also plot a standard scatterplot using `Seurat` ``` r FeaturePlot(pbmc, "CD4") ``` ![](nebulosa_files/figure-gfm/cd4_comparison-1.png) ``` r FeaturePlot(pbmc, "CD4", order = TRUE) ``` ![](nebulosa_files/figure-gfm/cd4_comparison-2.png) By smoothing the data, `Nebulosa` allows a better visualization of the global expression of CD4 in myeloid and CD4+ T cells. Notice that the “random” expression of CD4 in other areas of the plot is removed as the expression of this gene is not supported by many cells in those areas. Furthermore, CD4+ cells appear to show considerable dropout rate. Let’s plot the expression of CD4 with `Nebulosa` next to the clustering results ``` r DimPlot(pbmc, label = TRUE, repel = TRUE) ``` ![](nebulosa_files/figure-gfm/cd4_and_clustering-1.png) We can now easily identify that clusters `0` and `2` correspond to CD4+ T cells if we plot CD3D too. ``` r plot_density(pbmc, "CD3D") ``` ![](nebulosa_files/figure-gfm/plot_cd3d-1.png) # Multi-feature visualization Characterize cell populations usually relies in more than a single marker. Nebulosa allows the visualization of the joint density of from multiple features in a single plot. ## Identifying Naive CD8+ T cells Users familiarized with PBMC datasets may know that CD8+ CCR7+ cells usually cluster next to CD4+ CCR7+ and separate from the rest of CD8+ cells. Let’s aim to identify Naive CD8+ T cells. To do so, we can just add another gene to the vector containing the features to visualize. 
``` r p3 <- plot_density(pbmc, c("CD8A", "CCR7")) p3 + plot_layout(ncol = 1) ``` ![](nebulosa_files/figure-gfm/unnamed-chunk-1-1.png) `Nebulosa` can return a *joint density* plot by multiplying the densities from all query genes by using the `joint = TRUE` parameter: ``` r p4 <- plot_density(pbmc, c("CD8A", "CCR7"), joint = TRUE) p4 + plot_layout(ncol = 1) ``` ![](nebulosa_files/figure-gfm/unnamed-chunk-2-1.png) When compared to the clustering results, we can easily identify that Naive CD8+ T cells correspond to cluster `8`. `Nebulosa` returns the density estimates for each gene along with the joint density across all provided genes. By setting `combine = FALSE`, we can obtain a list of ggplot objects where the last plot corresponds to the joint density estimate. ``` r p_list <- plot_density(pbmc, c("CD8A", "CCR7"), joint = TRUE, combine = FALSE) p_list[[length(p_list)]] ``` ![](nebulosa_files/figure-gfm/unnamed-chunk-3-1.png) ## Identifying Naive CD4+ T cells Likewise, the identification of Naive CD4+ T cells becomes straightforward by combining `CD4` and `CCR7`: ``` r p4 <- plot_density(pbmc, c("CD4", "CCR7"), joint = TRUE) p4 + plot_layout(ncol = 1) ``` ![](nebulosa_files/figure-gfm/unnamed-chunk-4-1.png) Notice that these cells are mainly constrained to cluster `0` ``` r p4[[3]]/DimPlot(pbmc, label = TRUE, repel = TRUE) ``` ![](nebulosa_files/figure-gfm/unnamed-chunk-5-1.png) # Conclusions In summary,`Nebulosa`can be useful to recover the signal from dropped-out genes and improve their visualization in a two-dimensional space. We recommend using `Nebulosa` particularly for dropped-out genes. For fairly well-expressed genes, the direct visualization of the gene expression may be preferable. We encourage users to use `Nebulosa` along with the core visualization methods from the `Seurat` and `Bioconductor` environments as well as other visualization methods to draw more informed conclusions about their data. 
================================================ FILE: docs/pacmap.Rmd ================================================ --- title: "Running PaCMAP on a Seurat Object" date: 'Compiled: `r format(Sys.Date(), "%B %d, %Y")`' output: github_document: html_preview: false toc: false html_document: toc: false df_print: paged --- This vignette demonstrates how to run PaCMAP, a dimensionality reduction method that can be used for providing robust and trustworthy visualization, on a Seurat object. If you use our work, please cite both papers: > *Understanding How Dimension Reduction Tools Work: An Empirical Approach to Deciphering t-SNE, UMAP, TriMap, and PaCMAP for Data Visualization* > > Yingfan Wang, Haiyang Huang, Cynthia Rudin & Yaron Shaposhnik > > Journal of Machine Learning Research, 2021 > > doi: https://doi.org/10.48550/arXiv.2012.04456 > > *Towards a comprehensive evaluation of dimension reduction methods for transcriptomic data visualization* > > Haiyang Huang, Yingfan Wang, Cynthia Rudin and Edward P. Browne > > Communications biology, 2022 > > doi: https://doi.org/10.1038/s42003-022-03628-x > > GitHub: https://github.com/YingfanWang/PaCMAP ```{r setup, include=FALSE} knitr::opts_chunk$set( message = FALSE, warning = FALSE, fig.width = 10 ) ``` Prerequisites to install: * [Seurat](https://satijalab.org/seurat/install) * [SeuratWrappers](https://github.com/satijalab/seurat-wrappers) * [SeuratData](https://github.com/satijalab/seurat-data) * [Reticulate](https://rstudio.github.io/reticulate/) In addition to R packages, PaCMAP relies on Python to deliver high performance. To streamline the installation process and make environment management easier, we strongly recommend you to use anaconda(https://www.anaconda.com/download) or miniconda(https://docs.anaconda.com/miniconda/miniconda-install/) for managing Python environments. Below, we provide step-by-step instructions on how to properly install PaCMAP **after** you have installed one of these tools. 
Create a conda environment with PaCMAP installed: ```{bash conda environment setup, eval=FALSE} conda create -n "pacmap" python=3.12 # Install in the environment called "pacmap" conda activate pacmap conda install -y conda-forge::pacmap ``` To run PaCMAP, you need to connect your R console to the corresponding conda environment. If your Conda/Miniconda installation is located in a non-default directory, you might set up the conda variable as `/path/to/your/conda`. This ensures the correct environment is used during the installation. ```{r conda environment connects, eval=FALSE} reticulate::use_condaenv(condaenv = "pacmap", conda = "auto") ``` ```{r packages} library(Seurat) library(SeuratData) library(SeuratWrappers) ``` ### PaCMAP on PBMC3k To learn more about this dataset, type `?pbmc3k` ```{r pacmap, cache=TRUE, cache.lazy=TRUE} InstallData("pbmc3k") pbmc3k.final <- LoadData("pbmc3k",type="pbmc3k.final") # Initial processing to select variable features pbmc3k.final <- UpdateSeuratObject(pbmc3k.final) pbmc3k.final <- FindVariableFeatures(pbmc3k.final) # run PaCMAP on Seurat object. pbmc3k.final <- RunPaCMAP(object = pbmc3k.final, features=VariableFeatures(pbmc3k.final)) ``` ```{r explore, fig.width=6} # visualize markers features.plot <- c('CD3D', 'MS4A1', 'CD8A', 'GZMK', 'GZMB', 'FCGR3A') DimPlot(object=pbmc3k.final,reduction="pacmap") ``` ```{r explore2, fig.height=10} pbmc3k.final <- NormalizeData(pbmc3k.final, verbose = FALSE) FeaturePlot(pbmc3k.final, features.plot, ncol = 2, reduction="pacmap") ``` You can also specify dims of your original dataset for running PaCMAP ```{r pacmap_dim, cache=TRUE, cache.lazy=TRUE} # run PaCMAP on Seurat object. 
pbmc3k.final <- RunPaCMAP(object = pbmc3k.final, dims=2:5) # visualize markers features.plot <- c('CD3D', 'MS4A1', 'CD8A', 'GZMK', 'GZMB', 'FCGR3A') DimPlot(object=pbmc3k.final,reduction="pacmap") ``` ================================================ FILE: docs/pacmap.html ================================================ Running PaCMAP on a Seurat Object

This vignette demonstrates how to run PaCMAP, a dimensionality reduction method that can be used for providing robust and trustworthy visualization, on a Seurat object. If you use our work, please cite both papers:

Understanding How Dimension Reduction Tools Work: An Empirical Approach to Deciphering t-SNE, UMAP, TriMap, and PaCMAP for Data Visualization

Yingfan Wang, Haiyang Huang, Cynthia Rudin & Yaron Shaposhnik

Journal of Machine Learning Research, 2021

doi: https://doi.org/10.48550/arXiv.2012.04456

Towards a comprehensive evaluation of dimension reduction methods for transcriptomic data visualization

Haiyang Huang, Yingfan Wang, Cynthia Rudin and Edward P. Browne

Communications biology, 2022

doi: https://doi.org/10.1038/s42003-022-03628-x

GitHub: https://github.com/YingfanWang/PaCMAP

Prerequisites to install:

In addition to R packages, PaCMAP relies on Python to deliver high performance. To streamline the installation process and make environment management easier, we strongly recommend you to use anaconda(https://www.anaconda.com/download) or miniconda(https://docs.anaconda.com/miniconda/miniconda-install/) for managing Python environments. Below, we provide step-by-step instructions on how to properly install PaCMAP after you have installed one of these tools.

Create a conda environment with PaCMAP installed:

conda create -n "pacmap" python=3.12  # Install in the environment called "pacmap"
conda activate pacmap
conda install -y conda-forge::pacmap

To run PaCMAP, you need to connect your R console to the corresponding conda environment. If your Conda/Miniconda installation is located in a non-default directory, you might set up the conda variable as /path/to/your/conda. This ensures the correct environment is used during the installation.

reticulate::use_condaenv(condaenv = "pacmap", conda = "auto")
library(Seurat)
library(SeuratData)
library(SeuratWrappers)

PaCMAP on PBMC3k

To learn more about this dataset, type ?pbmc3k

InstallData("pbmc3k")
pbmc3k.final <- LoadData("pbmc3k",type="pbmc3k.final")

# Initial processing to select variable features
pbmc3k.final <- UpdateSeuratObject(pbmc3k.final)
pbmc3k.final <- FindVariableFeatures(pbmc3k.final)

# run PaCMAP on Seurat object.
pbmc3k.final <- RunPaCMAP(object = pbmc3k.final, features=VariableFeatures(pbmc3k.final))
## Applied PCA, the dimensionality becomes 100
## PaCMAP(n_neighbors=10, n_MN=5, n_FP=20, distance=euclidean, lr=1.0, n_iters=(100, 100, 450), apply_pca=True, opt_method='adam', verbose=True, intermediate=False, seed=11)
## Finding pairs
## Found nearest neighbor
## Calculated sigma
## Found scaled dist
## Pairs sampled successfully.
## ((26380, 2), (13190, 2), (52760, 2))
## Initial Loss: 32494.857421875
## Iteration:   10, Loss: 25802.580078
## Iteration:   20, Loss: 21603.363281
## Iteration:   30, Loss: 19970.650391
## Iteration:   40, Loss: 18992.988281
## Iteration:   50, Loss: 18181.544922
## Iteration:   60, Loss: 17354.800781
## Iteration:   70, Loss: 16440.773438
## Iteration:   80, Loss: 15367.431641
## Iteration:   90, Loss: 14006.279297
## Iteration:  100, Loss: 11969.539062
## Iteration:  110, Loss: 14622.074219
## Iteration:  120, Loss: 14481.925781
## Iteration:  130, Loss: 14432.553711
## Iteration:  140, Loss: 14414.109375
## Iteration:  150, Loss: 14406.267578
## Iteration:  160, Loss: 14402.332031
## Iteration:  170, Loss: 14400.175781
## Iteration:  180, Loss: 14398.969727
## Iteration:  190, Loss: 14398.415039
## Iteration:  200, Loss: 14398.177734
## Iteration:  210, Loss: 7290.769531
## Iteration:  220, Loss: 7165.277344
## Iteration:  230, Loss: 7109.806641
## Iteration:  240, Loss: 7076.959961
## Iteration:  250, Loss: 7059.577148
## Iteration:  260, Loss: 7048.008301
## Iteration:  270, Loss: 7038.852539
## Iteration:  280, Loss: 7031.291504
## Iteration:  290, Loss: 7024.563477
## Iteration:  300, Loss: 7018.940430
## Iteration:  310, Loss: 7013.954102
## Iteration:  320, Loss: 7009.539062
## Iteration:  330, Loss: 7005.522949
## Iteration:  340, Loss: 7001.719727
## Iteration:  350, Loss: 6998.311523
## Iteration:  360, Loss: 6995.219727
## Iteration:  370, Loss: 6992.364258
## Iteration:  380, Loss: 6989.705566
## Iteration:  390, Loss: 6987.210449
## Iteration:  400, Loss: 6984.850586
## Iteration:  410, Loss: 6982.660156
## Iteration:  420, Loss: 6980.610840
## Iteration:  430, Loss: 6978.653320
## Iteration:  440, Loss: 6976.790039
## Iteration:  450, Loss: 6975.051758
## Iteration:  460, Loss: 6973.373535
## Iteration:  470, Loss: 6971.781250
## Iteration:  480, Loss: 6970.295410
## Iteration:  490, Loss: 6968.885254
## Iteration:  500, Loss: 6967.526367
## Iteration:  510, Loss: 6966.229492
## Iteration:  520, Loss: 6965.000977
## Iteration:  530, Loss: 6963.808105
## Iteration:  540, Loss: 6962.668945
## Iteration:  550, Loss: 6961.575195
## Iteration:  560, Loss: 6960.505371
## Iteration:  570, Loss: 6959.466309
## Iteration:  580, Loss: 6958.451172
## Iteration:  590, Loss: 6957.499023
## Iteration:  600, Loss: 6956.579102
## Iteration:  610, Loss: 6955.684570
## Iteration:  620, Loss: 6954.833984
## Iteration:  630, Loss: 6954.013672
## Iteration:  640, Loss: 6953.199219
## Iteration:  650, Loss: 6952.405762
## Elapsed time: 1.35s
# visualize markers
features.plot <- c('CD3D', 'MS4A1', 'CD8A', 'GZMK', 'GZMB', 'FCGR3A')
DimPlot(object=pbmc3k.final,reduction="pacmap")

pbmc3k.final <- NormalizeData(pbmc3k.final, verbose = FALSE)
FeaturePlot(pbmc3k.final, features.plot, ncol = 2, reduction="pacmap")

You can also specify dims of your original dataset for running PaCMAP

# run PaCMAP on Seurat object.
pbmc3k.final <- RunPaCMAP(object = pbmc3k.final, dims=2:5)
## X is normalized
## PaCMAP(n_neighbors=10, n_MN=5, n_FP=20, distance=euclidean, lr=1.0, n_iters=(100, 100, 450), apply_pca=True, opt_method='adam', verbose=True, intermediate=False, seed=11)
## Finding pairs
## Found nearest neighbor
## Calculated sigma
## Found scaled dist
## Pairs sampled successfully.
## ((26380, 2), (13190, 2), (52760, 2))
## Initial Loss: 32494.857421875
## Iteration:   10, Loss: 25271.613281
## Iteration:   20, Loss: 21621.359375
## Iteration:   30, Loss: 19508.974609
## Iteration:   40, Loss: 18132.957031
## Iteration:   50, Loss: 17056.433594
## Iteration:   60, Loss: 16103.467773
## Iteration:   70, Loss: 15062.871094
## Iteration:   80, Loss: 13834.863281
## Iteration:   90, Loss: 12237.625000
## Iteration:  100, Loss: 9772.316406
## Iteration:  110, Loss: 12138.644531
## Iteration:  120, Loss: 12073.764648
## Iteration:  130, Loss: 12035.579102
## Iteration:  140, Loss: 12024.872070
## Iteration:  150, Loss: 12020.273438
## Iteration:  160, Loss: 12017.477539
## Iteration:  170, Loss: 12016.250000
## Iteration:  180, Loss: 12015.735352
## Iteration:  190, Loss: 12015.256836
## Iteration:  200, Loss: 12014.985352
## Iteration:  210, Loss: 5415.976562
## Iteration:  220, Loss: 5314.317383
## Iteration:  230, Loss: 5279.950195
## Iteration:  240, Loss: 5264.435059
## Iteration:  250, Loss: 5255.370605
## Iteration:  260, Loss: 5248.233398
## Iteration:  270, Loss: 5243.097168
## Iteration:  280, Loss: 5239.238281
## Iteration:  290, Loss: 5236.099609
## Iteration:  300, Loss: 5233.452637
## Iteration:  310, Loss: 5231.186523
## Iteration:  320, Loss: 5229.175781
## Iteration:  330, Loss: 5227.339355
## Iteration:  340, Loss: 5225.470703
## Iteration:  350, Loss: 5223.921875
## Iteration:  360, Loss: 5222.616699
## Iteration:  370, Loss: 5221.441895
## Iteration:  380, Loss: 5220.406250
## Iteration:  390, Loss: 5219.469727
## Iteration:  400, Loss: 5218.597168
## Iteration:  410, Loss: 5217.724609
## Iteration:  420, Loss: 5216.890625
## Iteration:  430, Loss: 5216.057617
## Iteration:  440, Loss: 5215.260742
## Iteration:  450, Loss: 5214.538574
## Iteration:  460, Loss: 5213.900391
## Iteration:  470, Loss: 5213.275879
## Iteration:  480, Loss: 5212.714844
## Iteration:  490, Loss: 5212.177734
## Iteration:  500, Loss: 5211.688965
## Iteration:  510, Loss: 5211.221680
## Iteration:  520, Loss: 5210.794434
## Iteration:  530, Loss: 5210.372559
## Iteration:  540, Loss: 5209.990723
## Iteration:  550, Loss: 5209.612305
## Iteration:  560, Loss: 5209.245117
## Iteration:  570, Loss: 5208.909180
## Iteration:  580, Loss: 5208.585938
## Iteration:  590, Loss: 5208.267578
## Iteration:  600, Loss: 5207.946777
## Iteration:  610, Loss: 5207.659668
## Iteration:  620, Loss: 5207.363281
## Iteration:  630, Loss: 5207.092285
## Iteration:  640, Loss: 5206.832520
## Iteration:  650, Loss: 5206.579102
## Elapsed time: 1.53s
# visualize markers
features.plot <- c('CD3D', 'MS4A1', 'CD8A', 'GZMK', 'GZMB', 'FCGR3A')
DimPlot(object=pbmc3k.final,reduction="pacmap")

================================================ FILE: docs/pacmap.md ================================================ Running PaCMAP on a Seurat Object ================ Compiled: Nov 20, 2024 This vignette demonstrates how to run PaCMAP, a dimensionality reduction method that can be used for providing robust and trustworthy visualization, on a Seurat object. If you use our work, please cite both papers: > *Understanding How Dimension Reduction Tools Work: An Empirical > Approach to Deciphering t-SNE, UMAP, TriMap, and PaCMAP for Data > Visualization* > > Yingfan Wang, Haiyang Huang, Cynthia Rudin & Yaron Shaposhnik > > Journal of Machine Learning Research, 2021 > > doi: > > *Towards a comprehensive evaluation of dimension reduction methods for > transcriptomic data visualization* > > Haiyang Huang, Yingfan Wang, Cynthia Rudin and Edward P. Browne > > Communications biology, 2022 > > doi: > > GitHub: Prerequisites to install: - [Seurat](https://satijalab.org/seurat/install) - [SeuratWrappers](https://github.com/satijalab/seurat-wrappers) - [SeuratData](https://github.com/satijalab/seurat-data) - [Reticulate](https://rstudio.github.io/reticulate/) In addition to R packages, PaCMAP relies on Python to deliver high performance. To streamline the installation process and make environment management easier, we strongly recommend you to use anaconda() or miniconda() for managing Python environments. Below, we provide step-by-step instructions on how to properly install PaCMAP **after** you have installed one of these tools. Create a conda environment with PaCMAP installed: ``` bash conda create -n "pacmap" python=3.12 # Install in the environment called "pacmap" conda activate pacmap conda install -y conda-forge::pacmap ``` To run PaCMAP, you need to connect your R console to the corresponding conda environment. If your Conda/Miniconda installation is located in a non-default directory, you might set up the conda variable as `/path/to/your/conda`. 
This ensures the correct environment is used during the installation. ``` r reticulate::use_condaenv(condaenv = "pacmap", conda = "auto") ``` ``` r library(Seurat) library(SeuratData) library(SeuratWrappers) ``` ### PaCMAP on PBMC3k To learn more about this dataset, type `?pbmc3k` ``` r InstallData("pbmc3k") pbmc3k.final <- LoadData("pbmc3k",type="pbmc3k.final") # Initial processing to select variable features pbmc3k.final <- UpdateSeuratObject(pbmc3k.final) pbmc3k.final <- FindVariableFeatures(pbmc3k.final) # run PaCMAP on Seurat object. pbmc3k.final <- RunPaCMAP(object = pbmc3k.final, features=VariableFeatures(pbmc3k.final)) ``` ## Applied PCA, the dimensionality becomes 100 ## PaCMAP(n_neighbors=10, n_MN=5, n_FP=20, distance=euclidean, lr=1.0, n_iters=(100, 100, 450), apply_pca=True, opt_method='adam', verbose=True, intermediate=False, seed=11) ## Finding pairs ## Found nearest neighbor ## Calculated sigma ## Found scaled dist ## Pairs sampled successfully. ## ((26380, 2), (13190, 2), (52760, 2)) ## Initial Loss: 32494.857421875 ## Iteration: 10, Loss: 25802.580078 ## Iteration: 20, Loss: 21603.363281 ## Iteration: 30, Loss: 19970.650391 ## Iteration: 40, Loss: 18992.988281 ## Iteration: 50, Loss: 18181.544922 ## Iteration: 60, Loss: 17354.800781 ## Iteration: 70, Loss: 16440.773438 ## Iteration: 80, Loss: 15367.431641 ## Iteration: 90, Loss: 14006.279297 ## Iteration: 100, Loss: 11969.539062 ## Iteration: 110, Loss: 14622.074219 ## Iteration: 120, Loss: 14481.925781 ## Iteration: 130, Loss: 14432.553711 ## Iteration: 140, Loss: 14414.109375 ## Iteration: 150, Loss: 14406.267578 ## Iteration: 160, Loss: 14402.332031 ## Iteration: 170, Loss: 14400.175781 ## Iteration: 180, Loss: 14398.969727 ## Iteration: 190, Loss: 14398.415039 ## Iteration: 200, Loss: 14398.177734 ## Iteration: 210, Loss: 7290.769531 ## Iteration: 220, Loss: 7165.277344 ## Iteration: 230, Loss: 7109.806641 ## Iteration: 240, Loss: 7076.959961 ## Iteration: 250, Loss: 7059.577148 ## Iteration: 
260, Loss: 7048.008301 ## Iteration: 270, Loss: 7038.852539 ## Iteration: 280, Loss: 7031.291504 ## Iteration: 290, Loss: 7024.563477 ## Iteration: 300, Loss: 7018.940430 ## Iteration: 310, Loss: 7013.954102 ## Iteration: 320, Loss: 7009.539062 ## Iteration: 330, Loss: 7005.522949 ## Iteration: 340, Loss: 7001.719727 ## Iteration: 350, Loss: 6998.311523 ## Iteration: 360, Loss: 6995.219727 ## Iteration: 370, Loss: 6992.364258 ## Iteration: 380, Loss: 6989.705566 ## Iteration: 390, Loss: 6987.210449 ## Iteration: 400, Loss: 6984.850586 ## Iteration: 410, Loss: 6982.660156 ## Iteration: 420, Loss: 6980.610840 ## Iteration: 430, Loss: 6978.653320 ## Iteration: 440, Loss: 6976.790039 ## Iteration: 450, Loss: 6975.051758 ## Iteration: 460, Loss: 6973.373535 ## Iteration: 470, Loss: 6971.781250 ## Iteration: 480, Loss: 6970.295410 ## Iteration: 490, Loss: 6968.885254 ## Iteration: 500, Loss: 6967.526367 ## Iteration: 510, Loss: 6966.229492 ## Iteration: 520, Loss: 6965.000977 ## Iteration: 530, Loss: 6963.808105 ## Iteration: 540, Loss: 6962.668945 ## Iteration: 550, Loss: 6961.575195 ## Iteration: 560, Loss: 6960.505371 ## Iteration: 570, Loss: 6959.466309 ## Iteration: 580, Loss: 6958.451172 ## Iteration: 590, Loss: 6957.499023 ## Iteration: 600, Loss: 6956.579102 ## Iteration: 610, Loss: 6955.684570 ## Iteration: 620, Loss: 6954.833984 ## Iteration: 630, Loss: 6954.013672 ## Iteration: 640, Loss: 6953.199219 ## Iteration: 650, Loss: 6952.405762 ## Elapsed time: 1.11s ``` r # visualize markers features.plot <- c('CD3D', 'MS4A1', 'CD8A', 'GZMK', 'GZMB', 'FCGR3A') DimPlot(object=pbmc3k.final,reduction="pacmap") ``` ![](pacmap_files/figure-gfm/explore-1.png) ``` r pbmc3k.final <- NormalizeData(pbmc3k.final, verbose = FALSE) FeaturePlot(pbmc3k.final, features.plot, ncol = 2, reduction="pacmap") ``` ![](pacmap_files/figure-gfm/explore2-1.png) You can also specify dims of your original dataset for running PaCMAP ``` r # run PaCMAP on Seurat object. 
pbmc3k.final <- RunPaCMAP(object = pbmc3k.final, dims=2:5) ``` ## X is normalized ## PaCMAP(n_neighbors=10, n_MN=5, n_FP=20, distance=euclidean, lr=1.0, n_iters=(100, 100, 450), apply_pca=True, opt_method='adam', verbose=True, intermediate=False, seed=11) ## Finding pairs ## Found nearest neighbor ## Calculated sigma ## Found scaled dist ## Pairs sampled successfully. ## ((26380, 2), (13190, 2), (52760, 2)) ## Initial Loss: 32494.857421875 ## Iteration: 10, Loss: 25271.613281 ## Iteration: 20, Loss: 21621.359375 ## Iteration: 30, Loss: 19508.974609 ## Iteration: 40, Loss: 18132.957031 ## Iteration: 50, Loss: 17056.433594 ## Iteration: 60, Loss: 16103.467773 ## Iteration: 70, Loss: 15062.871094 ## Iteration: 80, Loss: 13834.863281 ## Iteration: 90, Loss: 12237.625000 ## Iteration: 100, Loss: 9772.316406 ## Iteration: 110, Loss: 12138.644531 ## Iteration: 120, Loss: 12073.764648 ## Iteration: 130, Loss: 12035.579102 ## Iteration: 140, Loss: 12024.872070 ## Iteration: 150, Loss: 12020.273438 ## Iteration: 160, Loss: 12017.477539 ## Iteration: 170, Loss: 12016.250000 ## Iteration: 180, Loss: 12015.735352 ## Iteration: 190, Loss: 12015.256836 ## Iteration: 200, Loss: 12014.985352 ## Iteration: 210, Loss: 5415.976562 ## Iteration: 220, Loss: 5314.317383 ## Iteration: 230, Loss: 5279.950195 ## Iteration: 240, Loss: 5264.435059 ## Iteration: 250, Loss: 5255.370605 ## Iteration: 260, Loss: 5248.233398 ## Iteration: 270, Loss: 5243.097168 ## Iteration: 280, Loss: 5239.238281 ## Iteration: 290, Loss: 5236.099609 ## Iteration: 300, Loss: 5233.452637 ## Iteration: 310, Loss: 5231.186523 ## Iteration: 320, Loss: 5229.175781 ## Iteration: 330, Loss: 5227.339355 ## Iteration: 340, Loss: 5225.470703 ## Iteration: 350, Loss: 5223.921875 ## Iteration: 360, Loss: 5222.616699 ## Iteration: 370, Loss: 5221.441895 ## Iteration: 380, Loss: 5220.406250 ## Iteration: 390, Loss: 5219.469727 ## Iteration: 400, Loss: 5218.597168 ## Iteration: 410, Loss: 5217.724609 ## Iteration: 420, Loss: 
5216.890625 ## Iteration: 430, Loss: 5216.057617 ## Iteration: 440, Loss: 5215.260742 ## Iteration: 450, Loss: 5214.538574 ## Iteration: 460, Loss: 5213.900391 ## Iteration: 470, Loss: 5213.275879 ## Iteration: 480, Loss: 5212.714844 ## Iteration: 490, Loss: 5212.177734 ## Iteration: 500, Loss: 5211.688965 ## Iteration: 510, Loss: 5211.221680 ## Iteration: 520, Loss: 5210.794434 ## Iteration: 530, Loss: 5210.372559 ## Iteration: 540, Loss: 5209.990723 ## Iteration: 550, Loss: 5209.612305 ## Iteration: 560, Loss: 5209.245117 ## Iteration: 570, Loss: 5208.909180 ## Iteration: 580, Loss: 5208.585938 ## Iteration: 590, Loss: 5208.267578 ## Iteration: 600, Loss: 5207.946777 ## Iteration: 610, Loss: 5207.659668 ## Iteration: 620, Loss: 5207.363281 ## Iteration: 630, Loss: 5207.092285 ## Iteration: 640, Loss: 5206.832520 ## Iteration: 650, Loss: 5206.579102 ## Elapsed time: 1.08s ``` r # visualize markers features.plot <- c('CD3D', 'MS4A1', 'CD8A', 'GZMK', 'GZMB', 'FCGR3A') DimPlot(object=pbmc3k.final,reduction="pacmap") ``` ![](pacmap_files/figure-gfm/pacmap_dim-1.png) ================================================ FILE: docs/presto.html ================================================

Fast Differential Expression with Presto

Compiled: October 07, 2020

This vignette demonstrates the use of the Presto package in Seurat. Commands and parameters are based off of the Presto tutorial. If you use Presto in your work, please cite:

Presto scales Wilcoxon and auROC analyses to millions of observations

Ilya Korsunsky, Aparna Nathan, Nghia Millard, Soumya Raychaudhuri

bioRxiv, 2019.

Pre-print: https://www.biorxiv.org/content/10.1101/653253v1.full.pdf

GitHub: https://github.com/immunogenomics/presto

Prerequisites to install:

Differential Expression Testing for PBMC scRNA-seq Data

To learn more about this dataset, type ?pbmc3k

##                p_val avg_logFC pct.1 pct.2     p_val_adj
## CD79A  1.660326e-143 -2.989854 0.042 0.936 2.276972e-139
## TYROBP 3.516407e-138  3.512505 0.994 0.102 4.822401e-134
## S100A9 7.003189e-137  4.293303 0.996 0.134 9.604174e-133
## CST3   1.498348e-135  3.344758 0.992 0.174 2.054834e-131
## S100A4 8.872946e-135  2.854897 1.000 0.360 1.216836e-130
## LYZ    2.720838e-134  3.788514 1.000 0.422 3.731357e-130
## S100A8 3.115452e-133  4.039777 0.975 0.076 4.272530e-129
## CD79B  8.317731e-133 -2.667534 0.083 0.916 1.140694e-128
## S100A6 5.156920e-132  2.541609 0.996 0.352 7.072201e-128
## LGALS1 1.427548e-131  3.002493 0.979 0.131 1.957739e-127
##                     p_val avg_logFC pct.1 pct.2     p_val_adj cluster      gene
## CD79A.3      0.000000e+00  2.933865 0.936 0.044  0.000000e+00       B     CD79A
## MS4A1.3      0.000000e+00  2.290577 0.855 0.055  0.000000e+00       B     MS4A1
## LINC00926.1 2.998236e-274  1.956493 0.564 0.010 4.111781e-270       B LINC00926
## CD79B.3     1.126919e-273  2.381160 0.916 0.144 1.545457e-269       B     CD79B
## TCL1A.3     1.962618e-272  2.463556 0.622 0.023 2.691534e-268       B     TCL1A
## HLA-DQA1.2  3.017803e-267  2.104207 0.890 0.119 4.138616e-263       B  HLA-DQA1
## VPREB3      2.131575e-238  1.667466 0.488 0.008 2.923242e-234       B    VPREB3
## HLA-DQB1.2  2.076231e-230  2.112052 0.863 0.148 2.847343e-226       B  HLA-DQB1
## CD74.2      1.000691e-184  2.010688 1.000 0.819 1.372347e-180       B      CD74
## HLA-DRA.3   1.813356e-184  1.914531 1.000 0.492 2.486837e-180       B   HLA-DRA
================================================ FILE: docs/presto.md ================================================ Fast Differential Expression with Presto ================ Compiled: October 07, 2020 This vignette demonstrates the use of the Presto package in Seurat. Commands and parameters are based off of the [Presto tutorial](http://htmlpreview.github.io/?https://github.com/immunogenomics/presto/blob/master/docs/getting-started.html). If you use Presto in your work, please cite: > *Presto scales Wilcoxon and auROC analyses to millions of > observations* > > Ilya Korsunsky, Aparna Nathan, Nghia Millard, Soumya Raychaudhuri > > bioRxiv, 2019. > > Pre-print: > > GitHub: Prerequisites to install: - [Seurat](https://satijalab.org/seurat/install) - [Presto](https://github.com/immunogenomics/presto) - [SeuratWrappers](https://github.com/satijalab/seurat-wrappers) - [SeuratData](https://github.com/satijalab/seurat-data) ``` r library(presto) library(Seurat) library(SeuratData) library(SeuratWrappers) ``` ### Differential Expression Testing for PBMC scRNA-seq Data To learn more about this dataset, type `?pbmc3k` ``` r InstallData("pbmc3k") data("pbmc3k") pbmc3k <- NormalizeData(pbmc3k) Idents(pbmc3k) <- "seurat_annotations" diffexp.B.Mono <- RunPresto(pbmc3k, "CD14+ Mono", "B") head(diffexp.B.Mono, 10) ``` ## p_val avg_logFC pct.1 pct.2 p_val_adj ## CD79A 1.660326e-143 -2.989854 0.042 0.936 2.276972e-139 ## TYROBP 3.516407e-138 3.512505 0.994 0.102 4.822401e-134 ## S100A9 7.003189e-137 4.293303 0.996 0.134 9.604174e-133 ## CST3 1.498348e-135 3.344758 0.992 0.174 2.054834e-131 ## S100A4 8.872946e-135 2.854897 1.000 0.360 1.216836e-130 ## LYZ 2.720838e-134 3.788514 1.000 0.422 3.731357e-130 ## S100A8 3.115452e-133 4.039777 0.975 0.076 4.272530e-129 ## CD79B 8.317731e-133 -2.667534 0.083 0.916 1.140694e-128 ## S100A6 5.156920e-132 2.541609 0.996 0.352 7.072201e-128 ## LGALS1 1.427548e-131 3.002493 0.979 0.131 1.957739e-127 ``` r diffexp.all <- RunPrestoAll(pbmc3k) 
head(diffexp.all[diffexp.all$cluster == "B", ], 10) ``` ## p_val avg_logFC pct.1 pct.2 p_val_adj cluster gene ## CD79A.3 0.000000e+00 2.933865 0.936 0.044 0.000000e+00 B CD79A ## MS4A1.3 0.000000e+00 2.290577 0.855 0.055 0.000000e+00 B MS4A1 ## LINC00926.1 2.998236e-274 1.956493 0.564 0.010 4.111781e-270 B LINC00926 ## CD79B.3 1.126919e-273 2.381160 0.916 0.144 1.545457e-269 B CD79B ## TCL1A.3 1.962618e-272 2.463556 0.622 0.023 2.691534e-268 B TCL1A ## HLA-DQA1.2 3.017803e-267 2.104207 0.890 0.119 4.138616e-263 B HLA-DQA1 ## VPREB3 2.131575e-238 1.667466 0.488 0.008 2.923242e-234 B VPREB3 ## HLA-DQB1.2 2.076231e-230 2.112052 0.863 0.148 2.847343e-226 B HLA-DQB1 ## CD74.2 1.000691e-184 2.010688 1.000 0.819 1.372347e-180 B CD74 ## HLA-DRA.3 1.813356e-184 1.914531 1.000 0.492 2.486837e-180 B HLA-DRA ================================================ FILE: docs/presto.rmd ================================================ --- title: "Fast Differential Expression with Presto" date: 'Compiled: `r format(Sys.Date(), "%B %d, %Y")`' output: github_document: html_preview: true toc: true toc_depth: 3 fig_width: 16 html_document: df_print: kable theme: united fig_height: 5 fig_width: 16 out_height: 4 --- This vignette demonstrates the use of the Presto package in Seurat. Commands and parameters are based off of the [Presto tutorial](http://htmlpreview.github.io/?https://github.com/immunogenomics/presto/blob/master/docs/getting-started.html). If you use Presto in your work, please cite: > *Presto scales Wilcoxon and auROC analyses to millions of observations* > > Ilya Korsunsky, Aparna Nathan, Nghia Millard, Soumya Raychaudhuri > > bioRxiv, 2019. 
> > Pre-print: https://www.biorxiv.org/content/10.1101/653253v1.full.pdf > > GitHub: https://github.com/immunogenomics/presto ```{r setup, include=FALSE} if (!requireNamespace("presto")) { remotes::install_github("immunogenomics/presto", upgrade = FALSE) } knitr::opts_chunk$set( tidy = TRUE, tidy.opts = list(width.cutoff = 95), message = FALSE, warning = FALSE ) ``` Prerequisites to install: * [Seurat](https://satijalab.org/seurat/install) * [Presto](https://github.com/immunogenomics/presto) * [SeuratWrappers](https://github.com/satijalab/seurat-wrappers) * [SeuratData](https://github.com/satijalab/seurat-data) ```{r packages} library(presto) library(Seurat) library(SeuratData) library(SeuratWrappers) ``` ### Differential Expression Testing for PBMC scRNA-seq Data To learn more about this dataset, type `?pbmc3k` ```{r pbmc3k, cache=TRUE} InstallData("pbmc3k") data("pbmc3k") pbmc3k <- NormalizeData(pbmc3k) Idents(pbmc3k) <- 'seurat_annotations' diffexp.B.Mono <- RunPresto(pbmc3k, 'CD14+ Mono', 'B') head(diffexp.B.Mono, 10) diffexp.all <- RunPrestoAll(pbmc3k) head(diffexp.all[diffexp.all$cluster=='B', ], 10) ``` ================================================ FILE: docs/schex.Rmd ================================================ --- title: "Using schex with Seurat" date: 'Compiled: `r format(Sys.Date(), "%B %d, %Y")`' output: github_document: html_preview: true toc: false html_document: df_print: kable theme: united --- This vigettte demonstrates how to run schex on Seurat objects, which aims to provide better plots. If you use schex, please cite: > *Single cell transcriptomics reveals spatial and temporal dynamics > of gene expression in the developing mouse spinal cord* > > Delile, Julien, Teresa Rayon, Manuela Melchionda, Amelia Edwards, James > Briscoe, and Andreas Sagner. 
> > doi: [10.1242/dev.173807](https://doi.org/10.1242/dev.173807) > > Github: https://github.com/SaskiaFreytag/schex ```{r, include = FALSE} knitr::opts_chunk$set( tidy = TRUE, tidy.opts = list(width.cutoff = 95), message = FALSE, warning = FALSE, fig.height = 10, fig.width = 16 ) ``` Reduced dimension plotting is one of the essential tools for the analysis of single cell data. However, as the number of cells/nuclei in these plots increases, the usefulness of these plots decreases. Many cells are plotted on top of each other obscuring information, even when taking advantage of transparency settings. This package provides binning strategies of cells/nuclei into hexagon cells. Plotting summarized information of all cells/nuclei in their respective hexagon cells presents information without obstructions. The package seamlessly works with the two most common object classes for the storage of single cell data; `SingleCellExperiment` from the [SingleCellExperiment](https://bioconductor.org/packages/3.9/bioc/html/SingleCellExperiment.html) package and `Seurat` from the [Seurat](https://satijalab.org/seurat/) package. In this vignette I will be presenting the use of `schex` for `Seurat` objects. ## Load libraries Prerequisites to install that are not available via `install.packages`: * [schex](https://github.com/SaskiaFreytag/schex) * [SeuratData](https://github.com/satijalab/seurat-data) ```{r load-libraries, message=FALSE, warning=FALSE} library(Seurat) library(SeuratData) library(ggplot2) library(ggrepel) library(dplyr) theme_set(theme_classic()) library(schex) ``` ## Setup single cell data In order to demonstrate the capabilities of the schex package, I will use a dataset of Peripheral Blood Mononuclear Cells (PBMC) freely available from 10x Genomics. There are 2,700 single cells that were sequenced on the Illumina NextSeq 500. 
You can download the data from the [Seurat website](https://s3-us-west-2.amazonaws.com/10x.files/samples/cell/pbmc3k/pbmc3k_filtered_gene_bc_matrices.tar.gz). ```{r load} InstallData("pbmc3k") pbmc <- pbmc3k ``` In the next section, I will perform some simple quality control steps outlined in the [Seurat vignette](https://satijalab.org/seurat/v3.0/pbmc3k_tutorial.html). I will then calculate various dimension reductions and cluster the data, as also outlined in the vignette. ## Standard pre-processing workflow ```{r preprocess} pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^MT-") pbmc %>% subset(subset = nFeature_RNA > 200 & nFeature_RNA < 2500 & percent.mt < 5) %>% NormalizeData() %>% FindVariableFeatures() %>% ScaleData() %>% RunPCA(verbose = FALSE) %>% RunUMAP(dims = 1:10) %>% FindNeighbors(dims = 1:10) %>% FindClusters(resolution = 0.5, verbose = FALSE) -> pbmc ``` ## Plotting single cell data At this stage in the workflow we usually would like to plot aspects of our data in one of the reduced dimension representations. Instead of plotting this in an ordinary fashion, I will demonstrate how schex can provide a better way of plotting this. #### Calculate hexagon cell representation First, I will calculate the hexagon cell representation for each cell for a specified dimension reduction representation. I decide to use `nbins=40` which specifies that I divide my x range into 40 bins. Note that this might be a parameter that you want to play around with depending on the number of cells/ nuclei in your dataset. Generally, for more cells/nuclei, `nbins` should be increased. ```{r calc-hexbin} pbmc <- make_hexbin(pbmc, nbins = 40, dimension_reduction = "UMAP") ``` #### Plot number of cells/nuclei in each hexagon cell First I plot how many cells are in each hexagon cell. This should be relatively even, otherwise change the `nbins` parameter in the previous calculation. 
```{r plot-density} plot_hexbin_density(pbmc) ``` #### Plot meta data in hexagon cell representation Next I colour the hexagon cells by some meta information, such as the median total count or cluster membership in each hexagon cell. ```{r plot-meta-1} plot_hexbin_meta(pbmc, col="nCount_RNA", action="median") ``` ```{r plot-meta-2} plot_hexbin_meta(pbmc, col="RNA_snn_res.0.5", action="majority") ``` For convenience there is also a function that allows the calculation of label positions for factor variables. These can be overlayed with the package `ggrepel`. ```{r plot-meta-label} label_df <- make_hexbin_label(pbmc, col="RNA_snn_res.0.5") pp <- plot_hexbin_meta(pbmc, col="RNA_snn_res.0.5", action="majority") pp + ggrepel::geom_label_repel(data = label_df, aes(x=x, y=y, label = label), colour="black", label.size = NA, fill = NA) ``` #### Plot gene expression in hexagon cell representation Finally, I will visualize the gene expression of the CD19 gene in the hexagon cell representation. ```{r plot-gene} gene_id <-"CD19" plot_hexbin_gene(pbmc, type="logcounts", gene=gene_id, action="mean", xlab="UMAP1", ylab="UMAP2", title=paste0("Mean of ", gene_id)) ``` ================================================ FILE: docs/schex.html ================================================ Using schex with Seurat

This vignette demonstrates how to run schex on Seurat objects, which aims to provide better plots. If you use schex, please cite:

Single cell transcriptomics reveals spatial and temporal dynamics of gene expression in the developing mouse spinal cord

Delile, Julien, Teresa Rayon, Manuela Melchionda, Amelia Edwards, James Briscoe, and Andreas Sagner.

doi: 10.1242/dev.173807

Github: https://github.com/SaskiaFreytag/schex

Reduced dimension plotting is one of the essential tools for the analysis of single cell data. However, as the number of cells/nuclei in these plots increases, the usefulness of these plots decreases. Many cells are plotted on top of each other obscuring information, even when taking advantage of transparency settings. This package provides binning strategies of cells/nuclei into hexagon cells. Plotting summarized information of all cells/nuclei in their respective hexagon cells presents information without obstructions. The package seamlessly works with the two most common object classes for the storage of single cell data; SingleCellExperiment from the SingleCellExperiment package and Seurat from the Seurat package. In this vignette I will be presenting the use of schex for Seurat objects.

Load libraries

Prerequisites to install that are not available via install.packages:

library(Seurat)
library(SeuratData)
library(ggplot2)
library(ggrepel)
library(dplyr)
theme_set(theme_classic())
library(schex)

Setup single cell data

In order to demonstrate the capabilities of the schex package, I will use a dataset of Peripheral Blood Mononuclear Cells (PBMC) freely available from 10x Genomics. There are 2,700 single cells that were sequenced on the Illumina NextSeq 500. You can download the data from the Seurat website.

InstallData("pbmc3k")
pbmc <- pbmc3k

In the next section, I will perform some simple quality control steps outlined in the Seurat vignette. I will then calculate various dimension reductions and cluster the data, as also outlined in the vignette.

Standard pre-processing workflow

pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^MT-")
pbmc <- pbmc %>% subset(subset = nFeature_RNA > 200 & nFeature_RNA < 2500 & percent.mt < 5) %>% 
    NormalizeData() %>% FindVariableFeatures() %>% ScaleData() %>% RunPCA(verbose = FALSE) %>% RunUMAP(dims = 1:10) %>% 
    FindNeighbors(dims = 1:10) %>% FindClusters(resolution = 0.5, verbose = FALSE)

Plotting single cell data

At this stage in the workflow we usually would like to plot aspects of our data in one of the reduced dimension representations. Instead of plotting this in an ordinary fashion, I will demonstrate how schex can provide a better way of plotting this.

Calculate hexagon cell representation

First, I will calculate the hexagon cell representation for each cell for a specified dimension reduction representation. I decide to use nbins=40 which specifies that I divide my x range into 40 bins. Note that this might be a parameter that you want to play around with depending on the number of cells/ nuclei in your dataset. Generally, for more cells/nuclei, nbins should be increased.

pbmc <- make_hexbin(pbmc, nbins = 40, dimension_reduction = "UMAP")

Plot number of cells/nuclei in each hexagon cell

First I plot how many cells are in each hexagon cell. This should be relatively even, otherwise change the nbins parameter in the previous calculation.

plot_hexbin_density(pbmc)

Plot meta data in hexagon cell representation

Next I colour the hexagon cells by some meta information, such as the median total count or cluster membership in each hexagon cell.

plot_hexbin_meta(pbmc, col = "nCount_RNA", action = "median")

plot_hexbin_meta(pbmc, col = "RNA_snn_res.0.5", action = "majority")

For convenience there is also a function that allows the calculation of label positions for factor variables. These can be overlayed with the package ggrepel.

label_df <- make_hexbin_label(pbmc, col = "RNA_snn_res.0.5")
pp <- plot_hexbin_meta(pbmc, col = "RNA_snn_res.0.5", action = "majority")
pp + ggrepel::geom_label_repel(data = label_df, aes(x = x, y = y, label = label), colour = "black", 
    label.size = NA, fill = NA)

Plot gene expression in hexagon cell representation

Finally, I will visualize the gene expression of the CD19 gene in the hexagon cell representation.

gene_id <- "CD19"
plot_hexbin_gene(pbmc, type = "logcounts", gene = gene_id, action = "mean", xlab = "UMAP1", ylab = "UMAP2", 
    title = paste0("Mean of ", gene_id))

================================================ FILE: docs/schex.md ================================================ Using schex with Seurat ================ Compiled: August 07, 2019 This vigettte demonstrates how to run schex on Seurat objects, which aims to provide better plots. If you use schex, please cite: > *Single cell transcriptomics reveals spatial and temporal dynamics of gene expression in the developing mouse spinal cord* > > Delile, Julien, Teresa Rayon, Manuela Melchionda, Amelia Edwards, James Briscoe, and Andreas Sagner. > > doi: [0.1242/dev.173807](https://doi.org/0.1242/dev.173807) > > Github: Reduced dimension plotting is one of the essential tools for the analysis of single cell data. However, as the number of cells/nuclei in these these plots increases, the usefulness of these plots decreases. Many cells are plotted on top of each other obscuring information, even when taking advantage of transparency settings. This package provides binning strategies of cells/nuclei into hexagon cells. Plotting summarized information of all cells/nuclei in their respective hexagon cells presents information without obstructions. The package seemlessly works with the two most common object classes for the storage of single cell data; `SingleCellExperiment` from the [SingleCellExperiment](https://bioconductor.org/packages/3.9/bioc/html/SingleCellExperiment.html) package and `Seurat` from the [Seurat](https://satijalab.org/seurat/) package. In this vignette I will be presenting the use of `schex` for `Seurat` objects. 
Load libraries -------------- Prerequisites to install that are not available via `install.packages`: - [schex](https://github.com/SaskiaFreytag/schex) - [SeuratData](https://github.com/satijalab/seurat-data) ``` r library(Seurat) library(SeuratData) library(ggplot2) library(ggrepel) theme_set(theme_classic()) library(schex) ``` Setup single cell data ---------------------- In order to demonstrate the capabilities of the schex package, I will use the a dataset of Peripheral Blood Mononuclear Cells (PBMC) freely available from 10x Genomics. There are 2,700 single cells that were sequenced on the Illumina NextSeq 500. You can download the data from the [Seurat website](https://s3-us-west-2.amazonaws.com/10x.files/samples/cell/pbmc3k/pbmc3k_filtered_gene_bc_matrices.tar.gz). ``` r InstallData("pbmc3k") pbmc <- pbmc3k ``` In the next few sections, I will perform some simple quality control steps outlined in the [Seurat vignette](https://satijalab.org/seurat/v3.0/pbmc3k_tutorial.html). I will then calculate various dimension reductions and cluster the data, as also outlined in the vignette. Standard pre-processing workflow -------------------------------- ### Filtering Cells with high mitochondrial content as well as cells with too low or too high feature count are filtered. ``` r pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^MT-") pbmc <- subset(pbmc, subset = nFeature_RNA > 200 & nFeature_RNA < 2500 & percent.mt < 5) ``` ### Normalization Next a global-scaling normalization method is employed to normalizes the feature expression measurements for each cell. ``` r pbmc <- NormalizeData(pbmc, normalization.method = "LogNormalize", scale.factor = 10000, verbose = FALSE) ``` ### Identification of highly variable genes Many of the downstream methods are based on only the highly variable genes, hence we require their identification. 
``` r pbmc <- FindVariableFeatures(pbmc, selection.method = "vst", nfeatures = 2000, verbose = FALSE) ``` ### Scaling Prior to dimension reduction the data is scaled. ``` r all.genes <- rownames(pbmc) pbmc <- ScaleData(pbmc, features = all.genes, verbose = FALSE) ``` ### Perform dimensionality reductions First a PCA is applied to the data. Using the PCA you will have to decide on the dimensionality of the data. Here the dimensionality was decided to be 10. Please refer to the original Seurat vignette for methods on how this is assessed. ``` r pbmc <- RunPCA(pbmc, features = VariableFeatures(object = pbmc), verbose = FALSE) ``` Next a UMAP dimensionality reduction is also run. ``` r pbmc <- RunUMAP(pbmc, dims = 1:10, verbose = FALSE) ``` ### Clustering Finally the data is clustered. ``` r pbmc <- FindNeighbors(pbmc, dims = 1:10, verbose = FALSE) pbmc <- FindClusters(pbmc, resolution = 0.5, verbose = FALSE) ``` Plotting single cell data ------------------------- At this stage in the workflow we usually would like to plot aspects of our data in one of the reduced dimension representations. Instead of plotting this in an ordinary fashion, I will demonstrate how schex can provide a better way of plotting this. #### Calculate hexagon cell representation First, I will calculate the hexagon cell representation for each cell for a specified dimension reduction representation. I decide to use `nbins=40` which specifies that I divide my x range into 40 bins. Note that this might be a parameter that you want to play around with depending on the number of cells/ nuclei in your dataset. Generally, for more cells/nuclei, `nbins` should be increased. ``` r pbmc <- make_hexbin(pbmc, nbins = 40, dimension_reduction = "UMAP") ``` #### Plot number of cells/nuclei in each hexagon cell First I plot how many cells are in each hexagon cell. This should be relatively even, otherwise change the `nbins` parameter in the previous calculation. 
``` r plot_hexbin_density(pbmc) ``` ![](schex_files/figure-markdown_github/plot-density-1.png) #### Plot meta data in hexagon cell representation Next I colour the hexagon cells by some meta information, such as the median total count or cluster membership in each hexagon cell. ``` r plot_hexbin_meta(pbmc, col = "nCount_RNA", action = "median") ``` ![](schex_files/figure-markdown_github/plot-meta-1-1.png) ``` r plot_hexbin_meta(pbmc, col = "RNA_snn_res.0.5", action = "majority") ``` ![](schex_files/figure-markdown_github/plot-meta-2-1.png) For convenience there is also a function that allows the calculation of label positions for factor variables. These can be overlayed with the package `ggrepel`. ``` r label_df <- make_hexbin_label(pbmc, col = "RNA_snn_res.0.5") pp <- plot_hexbin_meta(pbmc, col = "RNA_snn_res.0.5", action = "majority") pp + ggrepel::geom_label_repel(data = label_df, aes(x = x, y = y, label = label), colour = "black", label.size = NA, fill = NA) ``` ![](schex_files/figure-markdown_github/plot-meta-label-1.png) #### Plot gene expression in hexagon cell representation Finally, I will visualize the gene expression of the CD19 gene in the hexagon cell representation. ``` r gene_id <- "CD19" plot_hexbin_gene(pbmc, type = "logcounts", gene = gene_id, action = "mean", xlab = "UMAP1", ylab = "UMAP2", title = paste0("Mean of ", gene_id)) ``` ![](schex_files/figure-markdown_github/plot-gene-1.png) ================================================ FILE: docs/scvelo.Rmd ================================================ --- title: "Estimating RNA Velocity using Seurat and scVelo" date: 'Compiled: `r format(Sys.Date(), "%B %d, %Y")`' output: html_document: df_print: kable theme: united github_document: html_preview: true toc: false --- This vignette demonstrates analysing RNA Velocity quantifications stored in a Seurat object using scVelo. 
If you use scVelo in your work, please cite: > *Generalizing RNA velocity to transient cell states through dynamical modeling* > > Volker Bergen, Marius Lange, Stefan Peidli, F. Alexander Wolf & Fabian J. Theis > > doi: [10.1101/820936](https://doi.org/10.1101/820936) > > Website: https://scvelo.readthedocs.io/ ```{r setup, include=FALSE} python3 <- Sys.which(names = c("python3.6", "python3")) python3 <- unname(obj = Filter(f = nchar, x = python3))[1] library(reticulate) reticulate::use_python(python = python3, required = TRUE) knitr::opts_chunk$set( tidy = TRUE, tidy.opts = list(width.cutoff = 95), message = FALSE, warning = FALSE, fig.height = 10, fig.width = 16 ) ``` ```{r install_deps, echo=FALSE} if (!requireNamespace("SeuratDisk", quietly = TRUE)) { remotes::install_github(repo = "mojaveazure/seurat-disk", upgrade = FALSE) } if (!py_module_available(module = "scvelo")) { pip3 <- Sys.which(names = "pip3")[1] if (!nchar(x = pip3)) { stop("Cannot find pip3 or scvelo", call. = FALSE) } system2(command = pip3, args = c("install", "scvelo")) } ``` Prerequisites to install: - [Seurat](https://satijalab.org/seurat/install) - [scVelo](https://scvelo.readthedocs.io/installation.html) - [SeuratDisk](https://mojaveazure.github.io/seurat-disk/#installation) - [SeuratWrappers](https://github.com/satijalab/seurat-wrappers) ```{r packages} library(Seurat) library(SeuratDisk) library(SeuratWrappers) ``` ```{r cleanup, results="hide", echo=FALSE, eval=TRUE} if (file.exists("mouseBM.h5Seurat")) { file.remove("mouseBM.h5Seurat") } if (file.exists("mouseBM.h5ad")) { file.remove("mouseBM.h5ad") } ``` ```{r download, results="hide", echo=FALSE, eval=TRUE} dir.create("~/Downloads", showWarnings = FALSE, recursive = TRUE) curl::curl_download( url = "http://pklab.med.harvard.edu/velocyto/mouseBM/SCG71.loom", destfile = "~/Downloads/SCG71.loom" ) ``` ```{r load_data3, results='hide', eval=TRUE} # If you don't have velocyto's example mouse bone marrow dataset, download with the CURL 
command # curl::curl_download(url = "http://pklab.med.harvard.edu/velocyto/mouseBM/SCG71.loom", destfile = "~/Downloads/SCG71.loom") ldat <- ReadVelocity(file = "~/Downloads/SCG71.loom") bm <- as.Seurat(x = ldat) bm[["RNA"]] <- bm[["spliced"]] bm <- SCTransform(bm) bm <- RunPCA(bm) bm <- RunUMAP(bm, dims = 1:20) bm <- FindNeighbors(bm, dims = 1:20) bm <- FindClusters(bm) DefaultAssay(bm) <- "RNA" SaveH5Seurat(bm, filename = "mouseBM.h5Seurat") Convert("mouseBM.h5Seurat", dest = "h5ad") ``` ```{python load_adata, eval=TRUE} # In Python import scvelo as scv adata = scv.read("mouseBM.h5ad") adata ``` ```{python scvelo, results="hide", eval=TRUE} scv.pp.filter_and_normalize(adata, min_shared_counts=20, n_top_genes=2000) scv.pp.moments(adata, n_pcs=30, n_neighbors=30) scv.tl.velocity(adata) scv.tl.velocity_graph(adata) scv.pl.velocity_embedding_stream(adata, basis="umap", color="seurat_clusters") scv.pl.velocity_embedding(adata, basis="umap", color="seurat_clusters", arrow_length=3, arrow_size=2, dpi=120) ``` ```{python latent_time, results="hide", eval=TRUE} scv.tl.recover_dynamics(adata) scv.tl.latent_time(adata) scv.pl.scatter(adata, color="latent_time", color_map="gnuplot") top_genes = adata.var["fit_likelihood"].sort_values(ascending=False).index[:300] scv.pl.heatmap(adata, var_names=top_genes, sortby="latent_time", col_color="seurat_clusters", n_convolve=100) ``` ================================================ FILE: docs/scvelo.html ================================================ Estimating RNA Velocity using Seurat and scVelo

This vignette demonstrates analysing RNA Velocity quantifications stored in a Seurat object using scVelo. If you use scVelo in your work, please cite:

Generalizing RNA velocity to transient cell states through dynamical modeling

Volker Bergen, Marius Lange, Stefan Peidli, F. Alexander Wolf & Fabian J. Theis

doi: 10.1101/820936

Website: https://scvelo.readthedocs.io/

Prerequisites to install:

library(Seurat)
library(SeuratDisk)
library(SeuratWrappers)
# If you don't have velocyto's example mouse bone marrow dataset, download with the CURL command
# curl::curl_download(url = 'http://pklab.med.harvard.edu/velocyto/mouseBM/SCG71.loom', destfile
# = '~/Downloads/SCG71.loom')
ldat <- ReadVelocity(file = "~/Downloads/SCG71.loom")
bm <- as.Seurat(x = ldat)
bm[["RNA"]] <- bm[["spliced"]]
bm <- SCTransform(bm)
bm <- RunPCA(bm)
bm <- RunUMAP(bm, dims = 1:20)
bm <- FindNeighbors(bm, dims = 1:20)
bm <- FindClusters(bm)
DefaultAssay(bm) <- "RNA"
SaveH5Seurat(bm, filename = "mouseBM.h5Seurat")
Convert("mouseBM.h5Seurat", dest = "h5ad")
# In Python
import scvelo as scv
adata = scv.read("mouseBM.h5ad")
adata
## AnnData object with n_obs × n_vars = 6667 × 24421
##     obs: 'orig.ident', 'nCount_spliced', 'nFeature_spliced', 'nCount_unspliced', 'nFeature_unspliced', 'nCount_ambiguous', 'nFeature_ambiguous', 'nCount_RNA', 'nFeature_RNA', 'nCount_SCT', 'nFeature_SCT', 'SCT_snn_res.0.8', 'seurat_clusters'
##     var: 'features', 'ambiguous_features', 'spliced_features', 'unspliced_features'
##     obsm: 'X_umap'
##     layers: 'ambiguous', 'spliced', 'unspliced'
scv.pp.filter_and_normalize(adata, min_shared_counts=20, n_top_genes=2000)
scv.pp.moments(adata, n_pcs=30, n_neighbors=30)
scv.tl.velocity(adata)
scv.tl.velocity_graph(adata)
scv.pl.velocity_embedding_stream(adata, basis="umap", color="seurat_clusters")

scv.pl.velocity_embedding(adata, basis="umap", color="seurat_clusters", arrow_length=3, arrow_size=2, dpi=120)

scv.tl.recover_dynamics(adata)
scv.tl.latent_time(adata)
scv.pl.scatter(adata, color="latent_time", color_map="gnuplot")

top_genes = adata.var["fit_likelihood"].sort_values(ascending=False).index[:300]
scv.pl.heatmap(adata, var_names=top_genes, sortby="latent_time", col_color="seurat_clusters", n_convolve=100)

================================================ FILE: docs/scvelo.md ================================================ Estimating RNA Velocity using Seurat and scVelo ================ Compiled: June 10, 2020 This vignette demonstrates analysing RNA Velocity quantifications stored in a Seurat object using scVelo. If you use scVelo in your work, please cite: > *Generalizing RNA velocity to transient cell states through dynamical > modeling* > > Volker Bergen, Marius Lange, Stefan Peidli, F. Alexander Wolf & Fabian > J. Theis > > doi: [10.1101/820936](https://doi.org/10.1101/820936) > > Website: Prerequisites to install: - [Seurat](https://satijalab.org/seurat/install) - [scVelo](https://scvelo.readthedocs.io/installation.html) - [SeuratDisk](https://mojaveazure.github.io/seurat-disk/#installation) - [SeuratWrappers](https://github.com/satijalab/seurat-wrappers) ``` r library(Seurat) library(SeuratDisk) library(SeuratWrappers) ``` ``` r # If you don't have velocyto's example mouse bone marrow dataset, download with the CURL command # curl::curl_download(url = 'http://pklab.med.harvard.edu/velocyto/mouseBM/SCG71.loom', destfile # = '~/Downloads/SCG71.loom') ldat <- ReadVelocity(file = "~/Downloads/SCG71.loom") bm <- as.Seurat(x = ldat) bm[["RNA"]] <- bm[["spliced"]] bm <- SCTransform(bm) bm <- RunPCA(bm) bm <- RunUMAP(bm, dims = 1:20) bm <- FindNeighbors(bm, dims = 1:20) bm <- FindClusters(bm) DefaultAssay(bm) <- "RNA" SaveH5Seurat(bm, filename = "mouseBM.h5Seurat") Convert("mouseBM.h5Seurat", dest = "h5ad") ``` ``` python # In Python import scvelo as scv adata = scv.read("mouseBM.h5ad") adata ``` ## AnnData object with n_obs × n_vars = 6667 × 24421 ## obs: 'orig.ident', 'nCount_spliced', 'nFeature_spliced', 'nCount_unspliced', 'nFeature_unspliced', 'nCount_ambiguous', 'nFeature_ambiguous', 'nCount_RNA', 'nFeature_RNA', 'nCount_SCT', 'nFeature_SCT', 'SCT_snn_res.0.8', 'seurat_clusters' ## var: 'features', 'ambiguous_features', 'spliced_features', 'unspliced_features' 
## obsm: 'X_umap' ## layers: 'ambiguous', 'spliced', 'unspliced' ``` python scv.pp.filter_and_normalize(adata, min_shared_counts=20, n_top_genes=2000) ``` ``` python scv.pp.moments(adata, n_pcs=30, n_neighbors=30) ``` ``` python scv.tl.velocity(adata) ``` ``` python scv.tl.velocity_graph(adata) ``` ``` python scv.pl.velocity_embedding_stream(adata, basis="umap", color="seurat_clusters") ``` ``` python scv.pl.velocity_embedding(adata, basis="umap", color="seurat_clusters", arrow_length=3, arrow_size=2, dpi=120) ``` ``` python scv.tl.recover_dynamics(adata) ``` ``` python scv.tl.latent_time(adata) ``` ``` python scv.pl.scatter(adata, color="latent_time", color_map="gnuplot") ``` ``` python top_genes = adata.var["fit_likelihood"].sort_values(ascending=False).index[:300] scv.pl.heatmap(adata, var_names=top_genes, sortby="latent_time", col_color="seurat_clusters", n_convolve=100) ``` ================================================ FILE: docs/tricycle.Rmd ================================================ --- title: "Running estimate_cycle_position from tricycle on Seurat Objects" date: 'Compiled: `r format(Sys.Date(), "%B %d, %Y")`' output: github_document: html_preview: true toc: true toc_depth: 3 fig_width: 16 html_document: df_print: kable theme: united fig_height: 5 fig_width: 16 out_height: 4 --- This vignette demonstrates the use of the estimate_cycle_position from the tricycle package on Seurat objects. > *Universal prediction of cell cycle position using transfer learning* > > Shijie C. Zheng, Genevieve Stein-O’Brien, Jonathan J. Augustin, Jared Slosberg, Giovanni A. Carosso, Briana Winer, Gloria Shin, Hans T. Bjornsson, Loyal A. Goff, Kasper D. Hansen > > bioRxiv, 2021. 
> > doi: [10.1101/2021.04.06.438463](https://doi.org/10.1101/2021.04.06.438463) > > Bioconductor: https://www.bioconductor.org/packages/release/bioc/html/tricycle.html ```{r setup, include=FALSE} knitr::opts_chunk$set( tidy = TRUE, tidy.opts = list(width.cutoff = 95), message = FALSE, warning = FALSE ) ``` Prerequisites to install: * [Seurat](https://satijalab.org/seurat/install) * [SeuratWrappers](https://github.com/satijalab/seurat-wrappers) * [tricycle](https://www.bioconductor.org/packages/release/bioc/html/tricycle.html) ```{r install.deps, include = FALSE} # SeuratWrappers:::CheckPackage(package = 'tricycle', repository = 'bioconductor') if (!require(tricycle)) remotes::install_github(repo = 'hansenlab/tricycle') ``` ```{r packages} library(Seurat) library(SeuratWrappers) library(tricycle) ``` ## Introduction The Biocondutor package [tricycle](https://www.bioconductor.org/packages/release/bioc/html/tricycle.html) infers cell cycle position for a single-cell RNA-seq dataset. Here, we show the implementation of **main** function of tricycle, estimate_cycle_position, on the Seurat objects. More information can be found at [tricycle](https://www.bioconductor.org/packages/release/bioc/html/tricycle.html). ## Loading examle data and making Seurat object ```{r seuratobj, eval = TRUE, echo = TRUE} data(neurosphere_example, package = "tricycle") neurosphere_example <- as.Seurat(neurosphere_example) neurosphere_example ``` Note that after converting the SingleCellExperiment object to Seurat object, the original "logcounts" assay is saved as a slot with name "data" in Seurat default Assay. ## Inferring the cell cycle position The `Runtricycle()` function in the SeuratWrappers package first project the data into the cell cycle embeddings using the internal reference in tricycle package, and then estimate the cell cycle position. The estimated cell cycle position is bound between 0 and 2pi. 
Note that we strive to get high resolution of cell cycle state, and we think the continuous position is more appropriate when describing the cell cycle. However, to help users understand the position variable, we also note that users can approximately relate 0.5pi to be the start of S stage, pi to be the start of G2M stage, 1.5pi to be the middle of M stage, and 1.75pi-0.25pi to be G1/G0 stage. ```{r run, eval = TRUE, echo = TRUE} neurosphere_example <- Runtricycle(object = neurosphere_example, slot = "data", reduction.name = "tricycleEmbedding", reduction.key = "tricycleEmbedding_", gname = NULL, gname.type = "ENSEMBL", species = "mouse") ``` ## Visualizing the results We could extract the cell cycle embedding and make a scatter plot of the embeddings colored by the position inference. And we also extract the expression level of gene Top2a for accessing the performance, described below. ```{r extract, eval = TRUE, echo = TRUE} plot.df <- FetchData(object = neurosphere_example, vars = c("tricycleEmbedding_1", "tricycleEmbedding_2", "tricyclePosition", "ENSMUSG00000020914")) names(plot.df)[4] <- "Top2a" ``` Let us plot out the cell cycle embedding. You could also plot other embeddings, such as T_SNE or UMAP with points colored by the cell cycle position. ```{r plotemb, eval = TRUE, echo = TRUE, fig.width = 10, fig.height = 7} library(ggplot2) library(cowplot) p <- tricycle:::.plot_emb_circle_scale(emb.m = plot.df[, 1:2], color.value = plot.df$tricyclePosition, color_by = "tricyclePosition", point.size = 3.5, point.alpha = 0.9 ) legend <- circle_scale_legend(text.size = 5, alpha = 0.9) plot_grid(p, legend, ncol = 2, rel_widths = c(1, 0.4)) ``` ## Assessing performance We have two ways of (quickly) assessing whether triCycle works. They are 1. Look at the projection of the data into the cell cycle embedding. 2. Look at the expression of key genes as a function of cell cycle position. Plotting the projection of the data into the cell cycle embedding is shown above. 
Our observation is that deeper sequenced data will have a more clearly ellipsoid pattern with an empty interior. As sequencing depth decreases, the radius of the ellipsoid decreases until the empty interior disappears. So the absence of an interior does not mean the method does not work. It is more important to inspect a couple of genes as a function of cell cycle position. We tend to use Top2a which is highly expressed and therefore "plottable" in every dataset. Other candidates are for example Smc2. To plot this data, we provide a convenient function `fit_periodic_loess()` to fit a loess line between the cyclic variable $\theta$ and other response variables. This fitting is done by making `theta.v` 3 periods `(c(theta.v - 2 * pi, theta.v, theta.v + 2 * pi))` and repeating `y` 3 times. Only the fitted values corresponding to original `theta.v` will be returned. In this example, we show how well the expression of the cell cycle marker gene *Top2a* change along $\theta$. ```{r loess, message = TRUE} fit.l <- fit_periodic_loess(neurosphere_example$tricyclePosition, plot.df$Top2a, plot = TRUE, x_lab = "Cell cycle position \u03b8", y_lab = "log2(Top2a)", fig.title = paste0("Expression of Top2a along \u03b8 (n=", ncol(neurosphere_example), ")")) names(fit.l) fit.l$fig + theme_bw(base_size = 14) ``` For Top2a we expect peak expression around $\pi$. ## Plot out the kernel density Another useful function is *plot_ccposition_den*, which computes kernel density of $\theta$ conditioned on a phenotype using von Mises distribution. The ouput figures are provided in two flavors, polar coordinates and Cartesian coordinates. This could be useful when comparing different cell types, treatments, or just stages. (Because we use a very small dataset here as example, we set the bandwith, i.e. the concentration parameter of the von Mises distribution as 10 to get a smooth line.) 
```{r density, message = TRUE} plot_ccposition_den(neurosphere_example$tricyclePosition, neurosphere_example$sample, 'sample', bw = 10, fig.title = "Kernel density of \u03b8") + theme_bw(base_size = 14) plot_ccposition_den(neurosphere_example$tricyclePosition, neurosphere_example$sample, 'sample', type = "circular", bw = 10, fig.title = "Kernel density of \u03b8") + theme_bw(base_size = 14) ``` ## Resoures for tricycle More information about constructing your own reference, other usages and running tricycle outside of the Seurat environment can be found at [tricycle](https://www.bioconductor.org/packages/release/bioc/html/tricycle.html). ================================================ FILE: docs/tricycle.html ================================================ Running estimate_cycle_position from tricycle on Seurat Objects

This vignette demonstrates the use of the estimate_cycle_position from the tricycle package on Seurat objects.

Universal prediction of cell cycle position using transfer learning

Shijie C. Zheng, Genevieve Stein-O’Brien, Jonathan J. Augustin, Jared Slosberg, Giovanni A. Carosso, Briana Winer, Gloria Shin, Hans T. Bjornsson, Loyal A. Goff, Kasper D. Hansen

bioRxiv, 2021.

doi: 10.1101/2021.04.06.438463

Bioconductor: https://www.bioconductor.org/packages/release/bioc/html/tricycle.html

Prerequisites to install:

library(Seurat)
library(SeuratWrappers)
library(tricycle)

Introduction

The Bioconductor package tricycle infers cell cycle position for a single-cell RNA-seq dataset. Here, we show the implementation of the main function of tricycle, estimate_cycle_position, on the Seurat objects. More information can be found at tricycle.

Loading example data and making a Seurat object

data(neurosphere_example, package = "tricycle")
neurosphere_example <- as.Seurat(neurosphere_example)
neurosphere_example
## An object of class Seurat 
## 1500 features across 400 samples within 1 assay 
## Active assay: RNA (1500 features, 0 variable features)

Note that after converting the SingleCellExperiment object to Seurat object, the original “logcounts” assay is saved as a slot with name “data” in Seurat default Assay.

Inferring the cell cycle position

The Runtricycle() function in the SeuratWrappers package first projects the data into the cell cycle embeddings using the internal reference in the tricycle package, and then estimates the cell cycle position. The estimated cell cycle position is bound between 0 and 2pi. Note that we strive to get high resolution of cell cycle state, and we think the continuous position is more appropriate when describing the cell cycle. However, to help users understand the position variable, we also note that users can approximately relate 0.5pi to be the start of S stage, pi to be the start of G2M stage, 1.5pi to be the middle of M stage, and 1.75pi-0.25pi to be G1/G0 stage.

neurosphere_example <- Runtricycle(object = neurosphere_example, slot = "data", reduction.name = "tricycleEmbedding", 
    reduction.key = "tricycleEmbedding_", gname = NULL, gname.type = "ENSEMBL", species = "mouse")

Visualizing the results

We could extract the cell cycle embedding and make a scatter plot of the embeddings colored by the position inference. And we also extract the expression level of gene Top2a for assessing the performance, described below.

plot.df <- FetchData(object = neurosphere_example, vars = c("tricycleEmbedding_1", "tricycleEmbedding_2", 
    "tricyclePosition", "ENSMUSG00000020914"))
names(plot.df)[4] <- "Top2a"

Let us plot out the cell cycle embedding. You could also plot other embeddings, such as T_SNE or UMAP with points colored by the cell cycle position.

library(ggplot2)
library(cowplot)
p <- tricycle:::.plot_emb_circle_scale(emb.m = plot.df[, 1:2], color.value = plot.df$tricyclePosition, 
    color_by = "tricyclePosition", point.size = 3.5, point.alpha = 0.9)
legend <- circle_scale_legend(text.size = 5, alpha = 0.9)
plot_grid(p, legend, ncol = 2, rel_widths = c(1, 0.4))

Assessing performance

We have two ways of (quickly) assessing whether tricycle works. They are

  1. Look at the projection of the data into the cell cycle embedding.
  2. Look at the expression of key genes as a function of cell cycle position.

Plotting the projection of the data into the cell cycle embedding is shown above. Our observation is that deeper sequenced data will have a more clearly ellipsoid pattern with an empty interior. As sequencing depth decreases, the radius of the ellipsoid decreases until the empty interior disappears. So the absence of an interior does not mean the method does not work.

It is more important to inspect a couple of genes as a function of cell cycle position. We tend to use Top2a, which is highly expressed and therefore “plottable” in every dataset. Other candidates are, for example, Smc2. To plot this data, we provide a convenient function fit_periodic_loess() to fit a loess line between the cyclic variable \(\theta\) and other response variables. This fitting is done by making theta.v 3 periods (c(theta.v - 2 * pi, theta.v, theta.v + 2 * pi)) and repeating y 3 times. Only the fitted values corresponding to the original theta.v will be returned. In this example, we show how well the expression of the cell cycle marker gene Top2a changes along \(\theta\).

fit.l <- fit_periodic_loess(neurosphere_example$tricyclePosition, plot.df$Top2a, plot = TRUE, x_lab = "Cell cycle position θ", 
    y_lab = "log2(Top2a)", fig.title = paste0("Expression of Top2a along θ (n=", ncol(neurosphere_example), 
        ")"))
names(fit.l)
## [1] "fitted"   "residual" "pred.df"  "loess.o"  "rsquared" "fig"
fit.l$fig + theme_bw(base_size = 14)

For Top2a we expect peak expression around \(\pi\).

Plot out the kernel density

Another useful function is plot_ccposition_den, which computes the kernel density of \(\theta\) conditioned on a phenotype using the von Mises distribution. The output figures are provided in two flavors, polar coordinates and Cartesian coordinates. This could be useful when comparing different cell types, treatments, or just stages. (Because we use a very small dataset here as an example, we set the bandwidth, i.e. the concentration parameter of the von Mises distribution, as 10 to get a smooth line.)

plot_ccposition_den(neurosphere_example$tricyclePosition, neurosphere_example$sample, "sample", 
    bw = 10, fig.title = "Kernel density of θ") + theme_bw(base_size = 14)

plot_ccposition_den(neurosphere_example$tricyclePosition, neurosphere_example$sample, "sample", 
    type = "circular", bw = 10, fig.title = "Kernel density of θ") + theme_bw(base_size = 14)

Resources for tricycle

More information about constructing your own reference, other usages and running tricycle outside of the Seurat environment can be found at tricycle.

================================================ FILE: docs/tricycle.md ================================================ Running estimate\_cycle\_position from tricycle on Seurat Objects ================ Compiled: July 09, 2021 - [Introduction](#introduction) - [Loading examle data and making Seurat object](#loading-examle-data-and-making-seurat-object) - [Inferring the cell cycle position](#inferring-the-cell-cycle-position) - [Visualizing the results](#visualizing-the-results) - [Assessing performance](#assessing-performance) - [Plot out the kernel density](#plot-out-the-kernel-density) - [Resoures for tricycle](#resoures-for-tricycle) This vignette demonstrates the use of the estimate\_cycle\_position from the tricycle package on Seurat objects. > *Universal prediction of cell cycle position using transfer learning* > > Shijie C. Zheng, Genevieve Stein-O’Brien, Jonathan J. Augustin, Jared > Slosberg, Giovanni A. Carosso, Briana Winer, Gloria Shin, Hans T. > Bjornsson, Loyal A. Goff, Kasper D. Hansen > > bioRxiv, 2021. > > doi: > [10.1101/2021.04.06.438463](https://doi.org/10.1101/2021.04.06.438463) > > Bioconductor: > Prerequisites to install: - [Seurat](https://satijalab.org/seurat/install) - [SeuratWrappers](https://github.com/satijalab/seurat-wrappers) - [tricycle](https://www.bioconductor.org/packages/release/bioc/html/tricycle.html) ``` r library(Seurat) library(SeuratWrappers) library(tricycle) ``` ## Introduction The Biocondutor package [tricycle](https://www.bioconductor.org/packages/release/bioc/html/tricycle.html) infers cell cycle position for a single-cell RNA-seq dataset. Here, we show the implementation of **main** function of tricycle, estimate\_cycle\_position, on the Seurat objects. More information can be found at [tricycle](https://www.bioconductor.org/packages/release/bioc/html/tricycle.html). 
## Loading examle data and making Seurat object ``` r data(neurosphere_example, package = "tricycle") neurosphere_example <- as.Seurat(neurosphere_example) neurosphere_example ``` ## An object of class Seurat ## 1500 features across 400 samples within 1 assay ## Active assay: RNA (1500 features, 0 variable features) Note that after converting the SingleCellExperiment object to Seurat object, the original “logcounts” assay is saved as a slot with name “data” in Seurat default Assay. ## Inferring the cell cycle position The `Runtricycle()` function in the SeuratWrappers package first project the data into the cell cycle embeddings using the internal reference in tricycle package, and then estimate the cell cycle position. The estimated cell cycle position is bound between 0 and 2pi. Note that we strive to get high resolution of cell cycle state, and we think the continuous position is more appropriate when describing the cell cycle. However, to help users understand the position variable, we also note that users can approximately relate 0.5pi to be the start of S stage, pi to be the start of G2M stage, 1.5pi to be the middle of M stage, and 1.75pi-0.25pi to be G1/G0 stage. ``` r neurosphere_example <- Runtricycle(object = neurosphere_example, slot = "data", reduction.name = "tricycleEmbedding", reduction.key = "tricycleEmbedding_", gname = NULL, gname.type = "ENSEMBL", species = "mouse") ``` ## Visualizing the results We could extract the cell cycle embedding and make a scatter plot of the embeddings colored by the position inference. And we also extract the expression level of gene Top2a for accessing the performance, described below. ``` r plot.df <- FetchData(object = neurosphere_example, vars = c("tricycleEmbedding_1", "tricycleEmbedding_2", "tricyclePosition", "ENSMUSG00000020914")) names(plot.df)[4] <- "Top2a" ``` Let us plot out the cell cycle embedding. 
You could also plot other embeddings, such as T\_SNE or UMAP with points colored by the cell cycle position. ``` r library(ggplot2) library(cowplot) p <- tricycle:::.plot_emb_circle_scale(emb.m = plot.df[, 1:2], color.value = plot.df$tricyclePosition, color_by = "tricyclePosition", point.size = 3.5, point.alpha = 0.9) legend <- circle_scale_legend(text.size = 5, alpha = 0.9) plot_grid(p, legend, ncol = 2, rel_widths = c(1, 0.4)) ``` ![](/__w/seurat-wrappers/seurat-wrappers/test-build/tricycle_files/figure-gfm/plotemb-1.png) ## Assessing performance We have two ways of (quickly) assessing whether triCycle works. They are 1. Look at the projection of the data into the cell cycle embedding. 2. Look at the expression of key genes as a function of cell cycle position. Plotting the projection of the data into the cell cycle embedding is shown above. Our observation is that deeper sequenced data will have a more clearly ellipsoid pattern with an empty interior. As sequencing depth decreases, the radius of the ellipsoid decreases until the empty interior disappears. So the absence of an interior does not mean the method does not work. It is more important to inspect a couple of genes as a function of cell cycle position. We tend to use Top2a which is highly expressed and therefore “plottable” in every dataset. Other candidates are for example Smc2. To plot this data, we provide a convenient function `fit_periodic_loess()` to fit a loess line between the cyclic variable \(\theta\) and other response variables. This fitting is done by making `theta.v` 3 periods `(c(theta.v - 2 * pi, theta.v, theta.v + 2 * pi))` and repeating `y` 3 times. Only the fitted values corresponding to original `theta.v` will be returned. In this example, we show how well the expression of the cell cycle marker gene *Top2a* change along \(\theta\). 
``` r fit.l <- fit_periodic_loess(neurosphere_example$tricyclePosition, plot.df$Top2a, plot = TRUE, x_lab = "Cell cycle position θ", y_lab = "log2(Top2a)", fig.title = paste0("Expression of Top2a along θ (n=", ncol(neurosphere_example), ")")) names(fit.l) ``` ## [1] "fitted" "residual" "pred.df" "loess.o" "rsquared" "fig" ``` r fit.l$fig + theme_bw(base_size = 14) ``` ![](/__w/seurat-wrappers/seurat-wrappers/test-build/tricycle_files/figure-gfm/loess-1.png) For Top2a we expect peak expression around \(\pi\). ## Plot out the kernel density Another useful function is *plot\_ccposition\_den*, which computes kernel density of \(\theta\) conditioned on a phenotype using von Mises distribution. The ouput figures are provided in two flavors, polar coordinates and Cartesian coordinates. This could be useful when comparing different cell types, treatments, or just stages. (Because we use a very small dataset here as example, we set the bandwith, i.e. the concentration parameter of the von Mises distribution as 10 to get a smooth line.) ``` r plot_ccposition_den(neurosphere_example$tricyclePosition, neurosphere_example$sample, "sample", bw = 10, fig.title = "Kernel density of θ") + theme_bw(base_size = 14) ``` ![](/__w/seurat-wrappers/seurat-wrappers/test-build/tricycle_files/figure-gfm/density-1.png) ``` r plot_ccposition_den(neurosphere_example$tricyclePosition, neurosphere_example$sample, "sample", type = "circular", bw = 10, fig.title = "Kernel density of θ") + theme_bw(base_size = 14) ``` ![](/__w/seurat-wrappers/seurat-wrappers/test-build/tricycle_files/figure-gfm/density-2.png) ## Resoures for tricycle More information about constructing your own reference, other usages and running tricycle outside of the Seurat environment can be found at [tricycle](https://www.bioconductor.org/packages/release/bioc/html/tricycle.html). 
================================================ FILE: docs/velocity.Rmd ================================================ --- title: "Estimating RNA Velocity using Seurat" date: 'Compiled: `r format(Sys.Date(), "%B %d, %Y")`' output: github_document: html_preview: true toc: false html_document: df_print: kable theme: united --- This vignette demonstrates analysing RNA Velocity quantifications stored in a Seurat object. Parameters are based off of the [RNA Velocity tutorial](http://pklab.med.harvard.edu/velocyto/notebooks/R/SCG71.nb.html). If you use velocyto in your work, please cite: > *RNA velocity of single cells* > > Gioele La Manno, Ruslan Soldatov, Amit Zeisel, Emelie Braun, Hannah Hochgerner, Viktor Petukhov, Katja Lidschreiber, Maria E. Kastriti, Peter Lönnerberg, Alessandro Furlan, Jean Fan, Lars E. Borm, Zehua Liu, David van Bruggen, Jimin Guo, Xiaoling He, Roger Barker, Erik Sundström, Gonçalo Castelo-Branco, Patrick Cramer, Igor Adameyko, Sten Linnarsson & Peter V. Kharchenko > > doi: [10.1038/s41586-018-0414-6](https://doi.org/10.1038/s41586-018-0414-6) > > Website: https://velocyto.org ```{r setup, include=FALSE} knitr::opts_chunk$set( tidy = TRUE, tidy.opts = list(width.cutoff = 95), message = FALSE, warning = FALSE, fig.height = 10, fig.width = 16 ) ``` Prerequisites to install: - [Seurat](https://satijalab.org/seurat/install) - [velocyto.R](https://github.com/velocyto-team/velocyto.R) - [SeuratWrappers](https://github.com/satijalab/seurat-wrappers) ```{r packages} library(Seurat) library(velocyto.R) library(SeuratWrappers) ``` ```{r load_data, results='hide', cache=TRUE} # If you don't have velocyto's example mouse bone marrow dataset, download with the CURL command # curl::curl_download(url = "http://pklab.med.harvard.edu/velocyto/mouseBM/SCG71.loom", destfile = '~/Downloads/SCG71.loom') ldat <- ReadVelocity(file = '~/Downloads/SCG71.loom') bm <- as.Seurat(x = ldat) bm <- SCTransform(object = bm, assay = 'spliced') bm <- RunPCA(object = bm, 
verbose = FALSE) bm <- FindNeighbors(object = bm, dims = 1:20) bm <- FindClusters(object = bm) bm <- RunUMAP(object = bm, dims = 1:20) bm <- RunVelocity( object = bm, deltaT = 1, kCells = 25, fit.quantile = 0.02 ) ident.colors <- scales::hue_pal()(n = length(x = levels(x = bm))) names(x = ident.colors) <- levels(x = bm) cell.colors <- ident.colors[Idents(object = bm)] names(x = cell.colors) <- colnames(x = bm) show.velocity.on.embedding.cor( emb = Embeddings(object = bm, reduction = 'umap'), vel = Tool(object = bm, slot = 'RunVelocity'), n = 200, scale = 'sqrt', cell.colors = ac(x = cell.colors, alpha = 0.5), cex = 0.8, arrow.scale = 3, show.grid.flow = TRUE, min.grid.cell.mass = 0.5, grid.n = 40, arrow.lwd = 1, do.par = FALSE, cell.border.alpha = 0.1 ) ``` ================================================ FILE: docs/velocity.html ================================================ Estimating RNA Velocity using Seurat

This vignette demonstrates analysing RNA Velocity quantifications stored in a Seurat object. Parameters are based off of the RNA Velocity tutorial. If you use velocyto in your work, please cite:

RNA velocity of single cells

Gioele La Manno, Ruslan Soldatov, Amit Zeisel, Emelie Braun, Hannah Hochgerner, Viktor Petukhov, Katja Lidschreiber, Maria E. Kastriti, Peter Lönnerberg, Alessandro Furlan, Jean Fan, Lars E. Borm, Zehua Liu, David van Bruggen, Jimin Guo, Xiaoling He, Roger Barker, Erik Sundström, Gonçalo Castelo-Branco, Patrick Cramer, Igor Adameyko, Sten Linnarsson & Peter V. Kharchenko

doi: 10.1038/s41586-018-0414-6

Website: https://velocyto.org

Prerequisites to install:

library(Seurat)
library(velocyto.R)
library(SeuratWrappers)
# If you don't have velocyto's example mouse bone marrow dataset, download with the CURL command
# curl::curl_download(url = 'http://pklab.med.harvard.edu/velocyto/mouseBM/SCG71.loom', destfile
# = '~/Downloads/SCG71.loom')
ldat <- ReadVelocity(file = "~/Downloads/SCG71.loom")
bm <- as.Seurat(x = ldat)
bm <- SCTransform(object = bm, assay = "spliced")
bm <- RunPCA(object = bm, verbose = FALSE)
bm <- FindNeighbors(object = bm, dims = 1:20)
bm <- FindClusters(object = bm)
bm <- RunUMAP(object = bm, dims = 1:20)
bm <- RunVelocity(object = bm, deltaT = 1, kCells = 25, fit.quantile = 0.02)
ident.colors <- (scales::hue_pal())(n = length(x = levels(x = bm)))
names(x = ident.colors) <- levels(x = bm)
cell.colors <- ident.colors[Idents(object = bm)]
names(x = cell.colors) <- colnames(x = bm)
show.velocity.on.embedding.cor(emb = Embeddings(object = bm, reduction = "umap"), vel = Tool(object = bm, 
    slot = "RunVelocity"), n = 200, scale = "sqrt", cell.colors = ac(x = cell.colors, alpha = 0.5), 
    cex = 0.8, arrow.scale = 3, show.grid.flow = TRUE, min.grid.cell.mass = 0.5, grid.n = 40, arrow.lwd = 1, 
    do.par = FALSE, cell.border.alpha = 0.1)

================================================ FILE: docs/velocity.md ================================================ Estimating RNA Velocity using Seurat ================ Compiled: July 15, 2019 This vignette demonstrates analysing RNA Velocity quantifications stored in a Seurat object. Parameters are based off of the [RNA Velocity tutorial](http://pklab.med.harvard.edu/velocyto/notebooks/R/SCG71.nb.html). If you use velocyto in your work, please cite: > *RNA velocity of single cells* > > Gioele La Manno, Ruslan Soldatov, Amit Zeisel, Emelie Braun, Hannah Hochgerner, Viktor Petukhov, Katja Lidschreiber, Maria E. Kastriti, Peter Lönnerberg, Alessandro Furlan, Jean Fan, Lars E. Borm, Zehua Liu, David van Bruggen, Jimin Guo, Xiaoling He, Roger Barker, Erik Sundström, Gonçalo Castelo-Branco, Patrick Cramer, Igor Adameyko, Sten Linnarsson & Peter V. Kharchenko > > doi: [10.1038/s41586-018-0414-6](https://doi.org/10.1038/s41586-018-0414-6) > > Website: Prerequisites to install: - [Seurat](https://satijalab.org/seurat/install) - [velocyto.R](https://github.com/velocyto-team/velocyto.R) - [SeuratWrappers](https://github.com/satijalab/seurat-wrappers) ``` r library(Seurat) library(velocyto.R) library(SeuratWrappers) ``` ``` r # If you don't have velocyto's example mouse bone marrow dataset, download with the CURL command # curl::curl_download(url = 'http://pklab.med.harvard.edu/velocyto/mouseBM/SCG71.loom', destfile # = '~/Downloads/SCG71.loom') ldat <- ReadVelocity(file = "~/Downloads/SCG71.loom") bm <- as.Seurat(x = ldat) bm <- SCTransform(object = bm, assay = "spliced") bm <- RunPCA(object = bm, verbose = FALSE) bm <- FindNeighbors(object = bm, dims = 1:20) bm <- FindClusters(object = bm) bm <- RunUMAP(object = bm, dims = 1:20) bm <- RunVelocity(object = bm, deltaT = 1, kCells = 25, fit.quantile = 0.02) ident.colors <- (scales::hue_pal())(n = length(x = levels(x = bm))) names(x = ident.colors) <- levels(x = bm) cell.colors <- ident.colors[Idents(object = bm)] names(x 
= cell.colors) <- colnames(x = bm) show.velocity.on.embedding.cor(emb = Embeddings(object = bm, reduction = "umap"), vel = Tool(object = bm, slot = "RunVelocity"), n = 200, scale = "sqrt", cell.colors = ac(x = cell.colors, alpha = 0.5), cex = 0.8, arrow.scale = 3, show.grid.flow = TRUE, min.grid.cell.mass = 0.5, grid.n = 40, arrow.lwd = 1, do.par = FALSE, cell.border.alpha = 0.1) ``` ![](velocity_files/figure-markdown_github/load_data-1.png) ================================================ FILE: man/ALRAChooseKPlot.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/alra.R \name{ALRAChooseKPlot} \alias{ALRAChooseKPlot} \title{ALRA Approximate Rank Selection Plot} \usage{ ALRAChooseKPlot(object, start = 0, combine = TRUE) } \arguments{ \item{object}{Seurat object} \item{start}{Index to start plotting singular value spacings from. The transition from "signal" to "noise" in the is hard to see because the first singular value spacings are so large. Nicer visualizations result from skipping the first few. If set to 0 (default) starts from k/2.} \item{combine}{Combine plots into a single gg object; note that if TRUE, themeing will not work when plotting multiple features} } \value{ A list of 3 ggplot objects splotting the singular values, the spacings of the singular values, and the p-values of the singular values. } \description{ Plots the results of the approximate rank selection process for ALRA. 
} \seealso{ \code{\link{RunALRA}} } \author{ Jun Zhao, George Linderman } ================================================ FILE: man/CellBrowser.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/cellbrowser.R \name{CellBrowser} \alias{CellBrowser} \alias{ExportToCellbrowser} \title{Export \code{Seurat} objects for UCSC cell browser and stop open cell browser instances from R} \usage{ ExportToCellbrowser( object, dir, dataset.name = Project(object = object), reductions = NULL, markers.file = NULL, cluster.field = NULL, cb.dir = NULL, port = NULL, use.mtx = FALSE, meta.fields = NULL, meta.fields.names = NULL, matrix.slot = "counts", markers.n = 100, skip.markers = FALSE, skip.expr.matrix = FALSE, skip.metadata = FALSE, skip.reductions = FALSE ) } \arguments{ \item{object}{Seurat object} \item{dir}{path to directory where to save exported files. These are: exprMatrix.tsv, tsne.coords.tsv, meta.tsv, markers.tsv and a default cellbrowser.conf} \item{dataset.name}{name of the dataset. Defaults to Seurat project name} \item{reductions}{vector of reduction names to export, defaults to all reductions.} \item{markers.file}{path to file with marker genes. By defaults, marker are searched in the object itself as misc$markers. If none are supplied in object or via this argument, they are recalculated with \code{FindAllMarkers}} \item{cluster.field}{name of the metadata field containing cell cluster} \item{cb.dir}{path to directory where to create UCSC cellbrowser static website content root, e.g. an index.html, .json files, etc. These files can be copied to any webserver. If this is specified, the cellbrowser package has to be accessible from R via reticulate.} \item{port}{on which port to run UCSC cellbrowser webserver after export} \item{use.mtx}{export the matrix in .mtx.gz format. 
Default is False, unless the matrix is bigger than R's maximum matrix size.} \item{meta.fields}{vector of meta fields to export, default is all.} \item{meta.fields.names}{vector meta field names to show in UI. Must have same length as meta.fields. Default is meta.fields.} \item{matrix.slot}{matrix to use, default is 'counts'} \item{markers.n}{if no markers were supplied, FindAllMarkers is run. This parameter indicates how many markers to calculate, default is 100} \item{skip.markers}{whether to skip exporting markers} \item{skip.expr.matrix}{whether to skip exporting expression matrix} \item{skip.metadata}{whether to skip exporting metadata} \item{skip.reductions}{whether to skip exporting reductions} \item{...}{specifies the metadata fields to export. To supply a field and its human readable name, pass name as \code{field="name"} parameter.} } \value{ This function exports Seurat object as a set of tsv files to \code{dir} directory, copying the \code{markers.file} if it is passed. It also creates the default \code{cellbrowser.conf} in the directory. This directory could be read by \code{cbBuild} to create a static website viewer for the dataset. If \code{cb.dir} parameter is passed, the function runs \code{cbBuild} (if it is installed) to create this static website in \code{cb.dir} directory. If \code{port} parameter is passed, it also runs the webserver for that directory and opens a browser. 
} \description{ Export \code{Seurat} objects for UCSC cell browser and stop open cell browser instances from R } \examples{ \dontrun{ ExportToCellbrowser(pbmc_small, dataset.name = "PBMC", dir = "out") } } \author{ Maximilian Haeussler, Nikolay Markov } ================================================ FILE: man/FastMNNIntegration.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/fast_mnn_v5.R \name{FastMNNIntegration} \alias{FastMNNIntegration} \title{Run fastMNN in Seurat 5} \usage{ FastMNNIntegration( object, assay = NULL, orig = NULL, groups = NULL, layers = NULL, scale.layer = NULL, features = 2000, new.reduction = "integrated.mnn", reduction.key = "mnn_", reconstructed.assay = "mnn.reconstructed", verbose = TRUE, ... ) } \arguments{ \item{object}{A merged seurat object} \item{assay}{Assay to use, defaults to the default assay of the first object} \item{groups}{A one-column data frame with grouping information} \item{layers}{Layers to use} \item{features}{Either a list of features to use when calculating batch correction, or a number (2000 by default) of variable features to select.} \item{reduction.key}{Key for resulting DimReduc} \item{reconstructed.assay}{Name for the assay containing the low-rank reconstruction of the expression matrix.} \item{verbose}{Print messages} \item{...}{Extra parameters passed to \code{\link[batchelor]{fastMNN}}} \item{reduction.name}{Name to store resulting DimReduc object as} } \value{ A Seurat object merged from the objects in \code{object.list} and a new DimReduc of name \code{reduction.name} (key set to \code{reduction.key}) with corrected embeddings matrix as well as the rotation matrix used for the PCA stored in the feature loadings slot. 
Also returns an expression matrix reconstructed from the low-rank approximation in the \code{reconstructed.assay} assay; all other metadata info \code{\link[batchelor]{fastMNN}} is stored in the \code{tool} slot, accessible with \code{\link[Seurat]{Tool}} } \description{ Run fastMNN in Seurat 5 } \note{ This function requires the \href{https://rdrr.io/github/LTLA/batchelor/}{\pkg{batchelor}} package to be installed } \examples{ \dontrun{ # Preprocessing obj <- SeuratData::LoadData("pbmcsca") obj[["RNA"]] <- split(obj[["RNA"]], f = obj$Method) obj <- NormalizeData(obj) obj <- FindVariableFeatures(obj) obj <- ScaleData(obj) obj <- RunPCA(obj) # After preprocessing, we integrate layers: obj <- IntegrateLayers(object = obj, method = FastMNNIntegration, new.reduction = 'integrated.mnn', verbose = FALSE) # We can also add parameters specific to FastMNN. # Here we set `k` to specify the number of nearest neighbors to use when identifying MNNs: obj <- IntegrateLayers(object = obj, method = FastMNNIntegration, new.reduction = 'integrated.mnn', k = 15, verbose = FALSE) } } \seealso{ \code{\link[batchelor]{fastMNN}} \code{\link[Seurat]{Tool}} } ================================================ FILE: man/LearnGraph.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/monocle3.R \name{LearnGraph} \alias{LearnGraph} \title{Run \code{link[monocle3]{learn_graph}} on a \code{\link[Seurat]{Seurat}} object} \usage{ LearnGraph(object, reduction = DefaultDimReduc(object = object), ...) 
} \arguments{ \item{object}{A \code{\link[Seurat]{Seurat}} object} \item{reduction}{Name of reduction to use for learning the pseudotime graph} \item{...}{Arguments passed to \code{\link[monocle3]{learn_graph}}} } \value{ A \code{\link[monocle3]{cell_data_set}} object with the pseudotime graph } \description{ Run \code{link[monocle3]{learn_graph}} on a \code{\link[Seurat]{Seurat}} object } \seealso{ \code{\link[monocle3]{learn_graph}} \code{\link[monocle3]{cell_data_set}} } ================================================ FILE: man/PlotMiQC.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/miqc.R \name{PlotMiQC} \alias{PlotMiQC} \title{Run miQC on a Seurat object} \usage{ PlotMiQC( seurat_object, percent.mt = "percent.mt", nFeature_RNA = "nFeature_RNA", model.slot = "flexmix_model", color.by = "miQC.probability" ) } \arguments{ \item{object}{Seurat object} } \description{ Run miQC on a Seurat object } \details{ _Function to plot the miQC mixture model stored in a Seurat object. `RunMiQC` must be run prior to plotting._ } \references{ Hippen et al. (2021) miQC: An adaptive probabilistic framework for quality control of single-cell RNA-sequencing data. bioRxiv doi: 10.1101/2021.03.03.433798 } ================================================ FILE: man/ReadAlevin.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/alevin.R \name{ReadAlevin} \alias{ReadAlevin} \title{Load alevin quantification data} \usage{ ReadAlevin(file, getMeta = FALSE, meanAndVariance = FALSE, ...) } \arguments{ \item{file}{path to \code{quants_mat.gz} file within alevin directory} \item{getMeta}{logical, option to use \code{tximeta} to programmatically obtain gene range information, default is FALSE. 
Ranges are stored in \code{chr}, \code{start}, and \code{end} in the \code{meta.features} slot.} \item{meanAndVariance}{logical, should mean and variance of counts be returned in \code{counts} and \code{data} slots, respectively} \item{...}{extra arguments passed to \code{tximport}, for example, \code{alevinArgs=list(filterBarcodes=TRUE)}.} } \value{ returns a Seurat object with alevin counts } \description{ A wrapper around tximport to create a \code{SeuratObject} from alevin quantification data. } \references{ Srivastava, Avi, et al. "Alevin efficiently estimates accurate gene abundances from dscRNA-seq data." Genome biology 20.1 (2019): 65. } \seealso{ \code{\link[alevin]{alevin}} } \author{ Avi Srivastava } ================================================ FILE: man/ReadVelocity.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/velocity.R \name{ReadVelocity} \alias{ReadVelocity} \title{Load RNA Velocity data from a loom file} \usage{ ReadVelocity(file, engine = "hdf5r", verbose = TRUE) } \arguments{ \item{file}{Path to loom file} \item{engine}{Method to load data data, choose from 'hdf5r' or 'h5'} \item{verbose}{Display progress updates} } \description{ This is a wrapper around \code{\link[velocyto.R]{read.loom.matrices}}, but sends messages to \code{stderr} instead of \code{stdout} (or silences messages with \code{verbose = FALSE}) } \seealso{ \code{\link[velocyto.R]{read.loom.matrices}} } ================================================ FILE: man/RunALRA.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/alra.R \name{RunALRA} \alias{RunALRA} \alias{RunALRA.default} \alias{RunALRA.Seurat} \title{Run Adaptively-thresholded Low Rank Approximation (ALRA)} \usage{ RunALRA(object, ...) \method{RunALRA}{default}( object, k = NULL, q = 10, quantile.prob = 0.001, use.mkl = FALSE, mkl.seed = -1, ... 
) \method{RunALRA}{Seurat}( object, k = NULL, q = 10, quantile.prob = 0.001, use.mkl = FALSE, mkl.seed = -1, assay = NULL, slot = "data", setDefaultAssay = TRUE, genes.use = NULL, K = NULL, thresh = 6, noise.start = NULL, q.k = 2, k.only = FALSE, ... ) } \arguments{ \item{object}{An object} \item{...}{Arguments passed to other methods} \item{k}{The rank of the rank-k approximation. Set to NULL for automated choice of k.} \item{q}{The number of additional power iterations in randomized SVD when computing rank k approximation. By default, q=10.} \item{quantile.prob}{The quantile probability to use when calculating threshold. By default, quantile.prob = 0.001.} \item{use.mkl}{Use the Intel MKL based implementation of SVD. Needs to be installed from https://github.com/KlugerLab/rpca-mkl.} \item{mkl.seed}{Only relevant if use.mkl=T. Set the seed for the random generator for the Intel MKL implementation of SVD. Any number <0 will use the current timestamp. If use.mkl=F, set the seed using set.seed() function as usual.} \item{assay}{Assay to use} \item{slot}{slot to use} \item{setDefaultAssay}{If TRUE, will set imputed results as default Assay} \item{genes.use}{genes to impute} \item{K}{Number of singular values to compute when choosing k. Must be less than the smallest dimension of the matrix. Default 100 or smallest dimension.} \item{noise.start}{Index for which all smaller singular values are considered noise. Default K - 20.} \item{q.k}{Number of additional power iterations when choosing k. Default 2.} \item{k.only}{If TRUE, only computes optimal k WITHOUT performing ALRA} \item{p.val.th}{The threshold for ''significance'' when choosing k. Default 1e-10.} } \description{ Runs ALRA, a method for imputation of dropped out values in scRNA-seq data. Computes the k-rank approximation to A_norm and adjusts it according to the error distribution learned from the negative values. Described in Linderman, G. C., Zhao, J., Kluger, Y. (2018). 
"Zero-preserving imputation of scRNA-seq data using low rank approximation." (bioRxiv:138677) } \examples{ \dontrun{ pbmc_small # Example 1: Simple usage, with automatic choice of k. pbmc_small_alra <- RunALRA(object = pbmc_small) # Example 2: Visualize choice of k, then run ALRA # First, choose K pbmc_small_alra <- RunALRA(pbmc_small, k.only=TRUE) # Plot the spectrum, spacings, and p-values which are used to choose k ggouts <- ALRAChooseKPlot(pbmc_small_alra) do.call(gridExtra::grid.arrange, c(ggouts, nrow=1)) # Run ALRA with the chosen k pbmc_small_alra <- RunALRA(pbmc_small_alra) } } \references{ Linderman, G. C., Zhao, J., Kluger, Y. (2018). "Zero-preserving imputation of scRNA-seq data using low rank approximation." (bioRxiv:138677) } \seealso{ \code{\link{ALRAChooseKPlot}} } \author{ Jun Zhao, George Linderman } ================================================ FILE: man/RunBanksy.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/banksy.R \name{RunBanksy} \alias{RunBanksy} \title{Run Banksy on a Seurat Object} \usage{ RunBanksy( object, lambda, assay = "RNA", slot = "data", use_agf = FALSE, dimx = NULL, dimy = NULL, dimz = NULL, ndim = 2, features = "variable", group = NULL, split.scale = TRUE, k_geom = 15, n = 2, sigma = 1.5, alpha = 0.05, k_spatial = 10, spatial_mode = "kNN_median", assay_name = "BANKSY", M = NULL, verbose = TRUE ) } \arguments{ \item{object}{A Seurat object} \item{lambda}{(numeric) Spatial weight parameter} \item{assay}{(character) Assay in Seurat object to use} \item{slot}{(character) Slot in Seurat assay to use} \item{use_agf}{(boolean) Whether to use the AGF} \item{dimx}{(character) Column name of spatial x dimension (must be in metadata)} \item{dimy}{(character) Column name of spatial y dimension (must be in metadata)} \item{dimz}{(character) Column name of spatial z dimension (must be in metadata)} \item{ndim}{(integer) Number of spatial dimensions to 
\item{kNN_unif: k-nearest neighbors with uniform kernel}
) } \arguments{ \item{object}{Seurat object} \item{assay}{Assay to pull data from} \item{slot}{Slot to pull data from.} \item{params}{\code{\link[CoGAPS]{CogapsParams}} object for specifying parameter settings} \item{temp.file}{Name of temporary data matrix file to create if running in a distributed mode. Setting to TRUE will generate the file name using \code{tempfile}.} \item{reduction.name}{Name of the CoGAPS reduction returned} \item{reduction.key}{Key for the CoGAPS reduction returned} } \value{ Returns a Seurat object with the CoGAPS results stored as a \code{\link{DimReduc}} object } \description{ Run CoGAPs on a Seurat object } \references{ E.J. Fertig, J. Ding, A.V. Favorov, G. Parmigiani, and M.F. Ochs (2010) CoGAPS: an integrated R/C++ package to identify overlapping patterns of activation of biological processes from expression data. Bioinformatics 26:2792-2793. } \seealso{ \code{\link[CoGAPS]{CoGAPS}} } ================================================ FILE: man/RunFastMNN.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/fast_mnn.R \name{RunFastMNN} \alias{RunFastMNN} \title{Run fastMNN} \usage{ RunFastMNN( object.list, assay = NULL, features = 2000, reduction.name = "mnn", reduction.key = "mnn_", reconstructed.assay = "mnn.reconstructed", verbose = TRUE, ... 
) } \arguments{ \item{object.list}{A list of Seurat objects} \item{assay}{Assay to use, defaults to the default assay of the first object} \item{features}{Either a list of features to use when calculating batch correction, or a number (2000 by default) of variable features to select.} \item{reduction.name}{Name to store resulting DimReduc object as} \item{reduction.key}{Key for resulting DimReduc} \item{reconstructed.assay}{Name for the assay containing the low-rank reconstruction of the expression matrix.} \item{verbose}{Print messages from \code{\link[Seurat]{SelectIntegrationFeatures}}} \item{...}{Extra parameters passed to \code{\link[batchelor]{fastMNN}}} } \value{ A Seurat object merged from the objects in \code{object.list} and a new DimReduc of name \code{reduction.name} (key set to \code{reduction.key}) with corrected embeddings matrix as well as the rotation matrix used for the PCA stored in the feature loadings slot. Also returns an expression matrix reconstructed from the low-rank approximation in the \code{reconstructed.assay} assay; all other metadata info \code{\link[batchelor]{fastMNN}} is stored in the \code{tool} slot, accessible with \code{\link[Seurat]{Tool}} } \description{ Run fastMNN } \seealso{ \code{\link[batchelor]{fastMNN}} \code{\link[Seurat]{Tool}} } ================================================ FILE: man/RunGLMPCA.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/glmpca.R \name{RunGLMPCA} \alias{RunGLMPCA} \title{Run GLMPCA} \usage{ RunGLMPCA( object, L = 5, assay = NULL, features = NULL, reduction.name = "glmpca", reduction.key = "GLMPC_", verbose = TRUE, ... ) } \arguments{ \item{object}{A Seurat object} \item{L}{The number of dimensions to return (defaults to 5)} \item{assay}{Assay to use, defaults to the default assay} \item{features}{A list of features to use when performing GLM-PCA. 
DimPlot(pbmc_small, reduction = 'glmpca')
("polynomial") are also supported. Default = "linear".}
bioRxiv doi: 10.1101/2021.03.03.433798 } ================================================ FILE: man/RunOptimizeALS.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/liger.R \name{RunOptimizeALS} \alias{RunOptimizeALS} \alias{optimizeALS} \title{Run optimizeALS on a Seurat object} \usage{ RunOptimizeALS( object, k, assay = NULL, split.by = "orig.ident", lambda = 5, thresh = 1e-06, max.iters = 30, reduction.name = "iNMF_raw", reduction.key = "riNMF_", nrep = 1, H.init = NULL, W.init = NULL, V.init = NULL, rand.seed = 1, print.obj = FALSE, ... ) } \arguments{ \item{object}{A merged Seurat object} \item{k}{Inner dimension of factorization (number of factors). Run suggestK to determine appropriate value; a general rule of thumb is that a higher k will be needed for datasets with more sub-structure.} \item{assay}{Assay to use, defaults to the default assay of the first object} \item{split.by}{Attribute for splitting, defaults to "orig.ident"} \item{lambda}{Regularization parameter. Larger values penalize dataset-specific effects more strongly (ie. alignment should increase as lambda increases). Run suggestLambda to determine most appropriate value for balancing dataset alignment and agreement (default 5.0).} \item{thresh}{Convergence threshold. Convergence occurs when |obj0-obj|/(mean(obj0,obj)) < thresh. (default 1e-6)} \item{max.iters}{Maximum number of block coordinate descent iterations to perform (default 30).} \item{reduction.name}{Name to store resulting DimReduc object as} \item{reduction.key}{Key for resulting DimReduc} \item{nrep}{Number of restarts to perform (iNMF objective function is non-convex, so taking the best objective from multiple successive initializations is recommended). For easier reproducibility, this increments the random seed by 1 for each consecutive restart, so future factorizations of the same dataset can be run with one rep if necessary. 
\code{\link[rliger]{optimizeALS}}
\item{MN_ratio}{A numeric value specifying the ratio of the number of mid-near pairs to the number of neighbors. Default is 0.5.} \item{FP_ratio}{A numeric value specifying the ratio of the number of further pairs to the number of neighbors. Default is 2.} \item{distance_method}{A character string specifying the distance metric to be used. Default is "euclidean".} \item{lr}{A numeric value specifying the learning rate of the AdaGrad optimizer. Default is 1.} \item{num_iters}{An integer specifying the number of iterations for PaCMAP optimization. Default is 250.}
Runs PaCMAP, a method for dimensionality reduction for scRNA-seq data. Constructs
} \author{ Yiyang Sun, Haiyang Huang, Gaurav Rajesh Parikh } ================================================ FILE: man/RunPresto.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/presto.R \name{RunPresto} \alias{RunPresto} \title{A Presto-based implementation of FindMarkers that runs Wilcoxon tests for the given identity classes} \usage{ RunPresto( object, ident.1 = NULL, ident.2 = NULL, group.by = NULL, subset.ident = NULL, assay = NULL, slot = "data", reduction = NULL, features = NULL, logfc.threshold = 0.25, test.use = "wilcox", min.pct = 0.1, min.diff.pct = -Inf, verbose = TRUE, only.pos = FALSE, max.cells.per.ident = Inf, random.seed = 1, latent.vars = NULL, min.cells.feature = 3, min.cells.group = 3, mean.fxn = NULL, fc.name = NULL, base = 2, ... ) } \arguments{ \item{ident.1}{Identity class to define markers for; pass an object of class \code{phylo} or 'clustertree' to find markers for a node in a cluster tree; passing 'clustertree' requires \code{\link{BuildClusterTree}} to have been run} \item{ident.2}{A second identity class for comparison; if \code{NULL}, use all other cells for comparison; if an object of class \code{phylo} or 'clustertree' is passed to \code{ident.1}, must pass a node to find markers for} \item{group.by}{Regroup cells into a different identity class prior to performing differential expression (see example)} \item{subset.ident}{Subset a particular identity class prior to regrouping. Only relevant if group.by is set (see example)} \item{assay}{Assay to use in differential expression testing} \item{slot}{Slot to pull data from; note that if \code{test.use} is "negbinom", "poisson", or "DESeq2", \code{slot} will be set to "counts"} \item{reduction}{Reduction to use in differential expression testing - will test for DE on cell embeddings} \item{mean.fxn}{Function to use for fold change or average difference calculation. 
If NULL, the appropriate function will be chosen according to the slot used}
If NULL, the fold change column will be named according to the logarithm base (eg, "avg_log2FC"), or if using the scale.data slot "avg_diff".} \item{base}{The base with respect to which logarithms are computed.} \item{return.thresh}{Only return markers that have a p-value < return.thresh, or a power > return.thresh (if the test is ROC)} } \value{ Matrix containing a ranked list of putative markers, and associated statistics (p-values, logFC, etc.) } \description{ Finds markers (Wilcoxon-differentially expressed genes) for each of the identity classes in a dataset } \seealso{ https://github.com/immunogenomics/presto } ================================================ FILE: man/RunQuantileAlignSNF.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/liger.R \name{RunQuantileAlignSNF} \alias{RunQuantileAlignSNF} \alias{quantileAlignSNF} \title{Run quantileAlignSNF on a Seurat object} \usage{ RunQuantileAlignSNF( object, split.by = "orig.ident", reduction = "iNMF_raw", reduction.name = "iNMF", reduction.key = "iNMF_", recalc.snf = FALSE, ref_dataset = NULL, prune.thresh = 0.2, min_cells = 2, quantiles = 50, nstart = 10, resolution = 1, center = FALSE, id.number = NULL, print.mod = FALSE, print.align.summary = FALSE, ... ) } \arguments{ \item{object}{A merged Seurat object} \item{split.by}{Attribute for splitting, defaults to "orig.ident"} \item{reduction}{Name of reduction to use} \item{reduction.name}{Name to store resulting DimReduc object as} \item{reduction.key}{Key for resulting DimReduc} \item{recalc.snf}{Recalculate \code{\link{SNF}}} \item{ref_dataset}{Name of dataset to use as a "reference" for normalization. By default, the dataset with the largest number of cells is used.} \item{prune.thresh}{Minimum allowed edge weight. 
\code{\link[rliger]{quantileAlignSNF}}
} \seealso{ \code{\link[rliger]{RunQuantileNorm}} } ================================================ FILE: man/RunQuantileNorm.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/liger.R \name{RunQuantileNorm} \alias{RunQuantileNorm} \alias{quantile_norm} \title{Run quantile_norm on a Seurat object} \usage{ RunQuantileNorm( object, split.by = "orig.ident", reduction = "iNMF_raw", reduction.name = "iNMF", reduction.key = "iNMF_", quantiles = 50, ref_dataset = NULL, min_cells = 20, knn_k = 20, dims.use = NULL, do.center = FALSE, max_sample = 1000, eps = 0.9, refine.knn = TRUE, ... ) } \arguments{ \item{object}{A merged Seurat object} \item{split.by}{Attribute for splitting, defaults to "orig.ident"} \item{reduction.name}{Name to store resulting DimReduc object as} \item{reduction.key}{Key for resulting DimReduc} \item{quantiles}{Number of quantiles to use for quantile normalization (default 50).} \item{ref_dataset}{Name of dataset to use as a "reference" for normalization. By default, the dataset with the largest number of cells is used.} \item{min_cells}{Minimum number of cells to consider a cluster shared across datasets (default 20)} \item{knn_k}{Number of nearest neighbors for within-dataset knn graph (default 20).} \item{dims.use}{Indices of factors to use for shared nearest factor determination (default 1:ncol(H[[1]])).} \item{do.center}{Centers the data when scaling factors (useful for less sparse modalities like methylation data). (default FALSE)} \item{max_sample}{Maximum number of cells used for quantile normalization of each cluster and factor. (default 1000)} \item{eps}{The error bound of the nearest neighbor search. 
(default 0.9) Lower values give more accurate nearest neighbor graphs but take much longer to compute.
\title{Run RNA Velocity}
\description{ Run RNA Velocity }
If no AnnotationDb object is given, the function will use
} \author{ \strong{Maintainer}: Paul Hoffman \email{nygcSatijalab@nygenome.org} (\href{https://orcid.org/0000-0002-7693-8957}{ORCID}) Authors: \itemize{ \item Andrew Butler \email{abutler@nygenome.org} (\href{https://orcid.org/0000-0003-3608-0463}{ORCID}) \item Rahul Satija \email{rsatija@nygenome.org} (\href{https://orcid.org/0000-0001-9448-8833}{ORCID}) \item Tim Stuart \email{tstuart@nygenome.org} (\href{https://orcid.org/0000-0002-3044-0897}{ORCID}) } Other contributors: \itemize{ \item Saket Choudhary \email{schoudhary@nygenome.org} (\href{https://orcid.org/0000-0001-5202-7633}{ORCID}) [contributor] \item David Collins \email{dcollins@nygenome.org} (\href{https://orcid.org/0000-0001-9243-7821}{ORCID}) [contributor] \item Yuhan Hao \email{yhao@nygenome.org} (\href{https://orcid.org/0000-0002-1810-0822}{ORCID}) [contributor] \item Austin Hartman \email{ahartman@nygenome.org} (\href{https://orcid.org/0000-0001-7278-1852}{ORCID}) [contributor] \item Gesmira Molla \email{gmolla@nygenome.org} (\href{https://orcid.org/0000-0002-8628-5056}{ORCID}) [contributor] } } ================================================ FILE: man/StopCellbrowser.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/cellbrowser.R \name{StopCellbrowser} \alias{StopCellbrowser} \title{Stop Cellbrowser web server} \usage{ StopCellbrowser() } \description{ Stop Cellbrowser web server } \examples{ \dontrun{ StopCellbrowser() } } ================================================ FILE: man/VeloPlot.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/velocity.R \name{VeloPlot} \alias{VeloPlot} \title{RNA Velocity Plot} \usage{ VeloPlot(object, reduction = NULL, ...) 
} \arguments{ \item{...}{Extra parameters passed on to \code{\link[velocyto.R]{show.velocity.on.embedding.cor}}} } \value{ Nothing, shows plot } \description{ RNA Velocity Plot } \seealso{ \code{\link[velocyto.R]{show.velocity.on.embedding.cor}} } \keyword{internal} ================================================ FILE: man/as.Seurat.extras.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/conos.R, R/monocle3.R, R/velocity.R \name{as.Seurat} \alias{as.Seurat} \alias{as.Seurat.Conos} \alias{as.Seurat.cell_data_set} \alias{as.Seurat.list} \title{Extra conversions to Seurat objects} \usage{ \method{as.Seurat}{Conos}( x, method = "mnn", reduction = "largeVis", idents = names(x = x$clusters)[1], verbose = TRUE, ... ) \method{as.Seurat}{cell_data_set}( x, counts = "counts", data = NULL, assay = "RNA", project = "cell_data_set", loadings = NULL, clusters = NULL, ... ) \method{as.Seurat}{list}( x, default.assay = 1, slot = "counts", min.cells = 0, min.features = 0, verbose = TRUE, ... ) } \arguments{ \item{method}{Name of matching method graph was built using} \item{reduction}{Name of graph embedding, if calculated} \item{idents}{Name of clutering method to set as identity class} \item{loadings}{Name of dimensional reduction to save loadings to, if present; defaults to first dimensional reduction present (eg. \code{SingleCellExperiment::reducedDimNames(x)[1]}); pass \code{NA} to suppress transfer of loadings} \item{clusters}{Name of clustering method to use for setting identity classes} \item{default.assay}{Name or index of matrix to use as default assay; defaults to name of first matrix in list} \item{slot}{Name of slot to store matrix in; choose from 'counts' or 'data'} } \description{ Extra conversions to Seurat objects } \details{ The \code{Conos} method for \code{\link[Seurat]{as.Seurat}} only works if all samples are \code{Seurat} objects. 
The object is initially constructed by merging all samples together using \code{\link[Seurat]{merge}}, any sample-level dimensional reductions and graphs will be lost during the merge. Extra information is added to the resulting Seurat object as follows: \itemize{ \item Pairwise alignments will be stored in miscellaneous data, as will any other miscellaneous information \item If a graph is present in the \code{graph} field, it will be stored as a \code{Graph} object, reordered to match cell order in the new \code{Seurat} object. It will be named "\code{DefaultAssay(SeuratObject)}_\code{method}" \item If an embedding is present in the \code{embedding} field as a \code{\link{matrix}}, it will be stored as a \code{DimReduc} object with the name \code{reduction} and a key value of "\code{toupper(reduction)}_" \item If the length of the \code{clusters} field is greater than zero, clustering information (\code{groups} field) will be added to object metadata. Extra information (\code{result} field) will be added to miscellaneous data with the name "conos.\code{clustering}.result" \item If present, the first clustering entry in the \code{clusters} field will be set as object identity classes } The \code{cell_data_set} method for \code{\link[Seurat]{as.Seurat}} utilizes the \code{\link[Seurat::as.Seurat]{SingleCellExperiment}} method of \code{\link[Seurat]{as.Seurat}} to handle moving over expression data, cell embeddings, and cell-level metadata. The following additional information will also be transfered over: \itemize{ \item Feature loadings from \code{cds@reduce_dim_aux$gene_loadings} will be added to the dimensional reduction specified by \code{loadings} or the name of the first dimensional reduction that contains "pca" (case-insensitive) if \code{loadings} is not set \item Monocle 3 clustering will be set as the default identity class. 
In addition, the Monocle 3 clustering will be added to cell-level metadata as \dQuote{monocle3_clusters}, if present \item Monocle 3 partitions will be added to cell-level metadata as \dQuote{monocle3_partitions}, if present \item Monocle 3 pseudotime calculations will be added to \dQuote{monocle3_pseudotime}, if present \item The nearest-neighbor graph, if present, will be converted to a \code{\link[Seurat]{Graph}} object, and stored as \dQuote{\code{assay}_monocle3_graph} } The \code{list} method for \code{\link[Seurat]{as.Seurat}} takes a named list of matrices (dense or sparse) and creates a single \code{Seurat} object where each matrix is its own assay. The names of the list are taken to be the names of the assays. If not present, assays will be named as "Assay#" where "#" is the index number in the list of matrices. Objects will be constructed as follows: \itemize{ \item By default, all matrices are assumed to be raw counts and will be stored in the \code{counts} slot. This can be changed to store in the matrix in the \code{data} slot instead. The \code{slot} parameter is vectorized, so different matrices can be stored in either \code{counts} or \code{data} \item For any and all matrices designated as \code{counts}, the \code{min.cells} and \code{min.features} filtering will be applied. These parameters are vectorized, so different filterings can be applied to different matrices \item No extra information (eg. 
\code{project}) can be provided to \code{\link[Seurat]{CreateSeuratObject}} } } \seealso{ \code{\link[Seurat]{as.Seurat}} \code{\link[Seurat]{as.Seurat.SingleCellExperiment}} } ================================================ FILE: man/as.cell_data_set.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/monocle3.R \name{as.cell_data_set} \alias{as.cell_data_set} \alias{as.CellDataSet} \alias{as.cell_data_set.Seurat} \title{Convert objects to Monocle3 \code{cell_data_set} objects} \usage{ as.cell_data_set(x, ...) \method{as.cell_data_set}{Seurat}( x, assay = DefaultAssay(object = x), reductions = AssociatedDimReducs(object = x, assay = assay), default.reduction = DefaultDimReduc(object = x, assay = assay), graph = paste0(assay, "_snn"), group.by = NULL, ... ) } \arguments{ \item{x}{An object} \item{...}{Arguments passed to other methods} \item{reductions}{A vector of dimensional reductions add to the \code{cell_data_set} object; defaults to all dimensional reductions calculated from \code{assay} and all \link[Seurat:IsGlobal]{global} dimensional reductions} \item{default.reduction}{Name of dimensional reduction to use for clustering name} \item{graph}{Name of graph to be used for clustering results} \item{group.by}{Name of cell-level metadata column to use as identites; pass} } \value{ A \code{cell_data_set} object } \description{ Convert objects to Monocle3 \code{cell_data_set} objects } \details{ The \code{\link[Seurat]{Seurat}} method utilizes \code{\link[Seurat]{as.SingleCellExperiment}} to transfer over expression and cell-level metadata. The following additional information is also transferred over: \itemize{ \item Cell emebeddings are transferred over to the \code{\link[SingleCellExperiment]{reducedDims}} slot. Dimensional reduction names are converted to upper-case (eg. 
\dQuote{umap} to \dQuote{UMAP}) to match Monocle 3 style \item Feature loadings are transfered to \code{cds@reduce_dim_aux$gene_loadings} if present. \strong{NOTE}: only the feature loadings of the last dimensional reduction are transferred over \item Standard deviations are added to \code{cds@reduce_dim_aux$prop_var_expl} if present. \strong{NOTE}: only the standard deviations of the last dimensional reduction are transferred over \item Clustering information is transferred over in the following manner: if cell-level metadata entries \dQuote{monocle3_clusters} and \dQuote{monocle3_partitions} exist, then these will be set as the clusters and partitions, with no nearest neighbor graph being added to the object; otherwise, Seurat's nearest-neighbor graph will be converted to an \code{\link[igraph]{igraph}} object and added to the \code{cell_data_set} object along with Seurat's clusters. No partition information is added when using Seurat's clsuters } } \seealso{ \code{\link[Seurat]{as.SingleCellExperiment}} } ================================================ FILE: man/findMatrix.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/cellbrowser.R \name{findMatrix} \alias{findMatrix} \title{used by ExportToCellbrowser: Return a matrix object from a Seurat object or show an error message} \usage{ findMatrix(object, matrix.slot) } \arguments{ \item{object}{Seurat object} \item{matrix.slot}{the name of the slot} } \description{ used by ExportToCellbrowser: Return a matrix object from a Seurat object or show an error message } ================================================ FILE: man/scVIIntegration.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/scVI.R \name{scVIIntegration} \alias{scVIIntegration} \title{scVI Integration} \usage{ scVIIntegration( object, features = NULL, layers = "counts", conda_env = NULL, 
new.reduction = "integrated.dr", ndims = 30, nlayers = 2, gene_likelihood = "nb", max_epochs = NULL, ... ) } \arguments{ \item{object}{A \code{StdAssay} or \code{STDAssay} instance containing merged data} \item{features}{Features to integrate} \item{layers}{Layers to integrate} \item{conda_env}{conda environment to run scVI} \item{new.reduction}{Name under which to store resulting DimReduc object} \item{ndims}{Dimensionality of the latent space} \item{nlayers}{Number of hidden layers used for encoder and decoder NNs} \item{gene_likelihood}{Distribution to use for modelling expression data: {"zinb", "nb", "poisson"}} \item{max_epochs}{Number of passes through the dataset taken while training the model} \item{...}{Unused - currently just capturing parameters passed in from \code{Seurat::IntegrateLayers} intended for other integration methods} } \value{ A single-element named list \code{DimReduc} elements containing the integrated data } \description{ scVI Integration } \note{ This function requires the \href{https://docs.scvi-tools.org/en/stable/installation.html}{\pkg{scvi-tools}} package to be installed } \examples{ \dontrun{ # Preprocessing obj <- SeuratData::LoadData("pbmcsca") obj[["RNA"]] <- split(obj[["RNA"]], f = obj$Method) obj <- NormalizeData(obj) obj <- FindVariableFeatures(obj) obj <- ScaleData(obj) obj <- RunPCA(obj) # After preprocessing, we integrate layers, specifying a conda environment obj <- IntegrateLayers( object = obj, method = scVIIntegration, new.reduction = "integrated.scvi", conda_env = "../miniconda3/envs/scvi-env", verbose = FALSE ) # Alternatively, we can integrate SCTransformed data obj <- SCTransform(object = obj) obj <- IntegrateLayers( object = obj, method = scVIIntegration, orig.reduction = "pca", new.reduction = "integrated.scvi", assay = "SCT", conda_env = "../miniconda3/envs/scvi-env", verbose = FALSE ) } } \seealso{ \href{https://docs.scvi-tools.org/en/stable/tutorials/notebooks/scvi_in_R.html}{scVI} } 
================================================
FILE: man/writeSparseTsvChunks.Rd
================================================
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/cellbrowser.R
\name{writeSparseTsvChunks}
\alias{writeSparseTsvChunks}
\title{Used by \code{ExportToCellbrowser}: Write a big sparse matrix to a .tsv.gz file by writing chunks, concatenating them with the Unix cat command, then gzipping the result. This does not work on Windows, we'd have to use the copy /b command there.}
\usage{
writeSparseTsvChunks(inMat, outFname, sliceSize = 1000)
}
\arguments{
\item{inMat}{input matrix}

\item{outFname}{output file name, has to end with .gz}

\item{sliceSize=1000, }{size of each chunk in number of lines}
}
\value{
Invisibly returns \code{NULL}
}
\description{
Used by \code{ExportToCellbrowser}: Write a big sparse matrix to a .tsv.gz file by writing chunks, concatenating them with the Unix cat command, then gzipping the result. This does not work on Windows, we'd have to use the copy /b command there.
} \examples{ \dontrun{ writeSparseTsvChunks( pbmc_small@data, "exprMatrix.tsv.gz") } } ================================================ FILE: seurat-wrappers.Rproj ================================================ Version: 1.0 RestoreWorkspace: Default SaveWorkspace: Default AlwaysSaveHistory: Default EnableCodeIndexing: Yes UseSpacesForTab: Yes NumSpacesForTab: 2 Encoding: UTF-8 RnwWeave: Sweave LaTeX: pdfLaTeX AutoAppendNewline: Yes StripTrailingWhitespace: Yes BuildType: Package PackageUseDevtools: Yes PackageInstallArgs: --no-multiarch --with-keep.source PackageRoxygenize: rd,collate,namespace ================================================ FILE: test-vignettes.sh ================================================ #!/bin/bash # Check changed vignettes for SeuratWrappers # Set global options to cause the script to fail upon failure of any one step set -eo pipefail # A simple function to get the extension of files function extension() { local fname="${1}" # Name of file to get extension of echo "$(echo ${fname} | rev | cut -f 1 -d '.' 
| rev)" } export -f extension # Get the remote for Satija Lab SATIJA_BRANCH="$(Rscript -e "cat(Seurat:::RandomName())")" (set -x; git fetch https://github.com/satijalab/seurat-wrappers +master:"${SATIJA_BRANCH}") # Get the branch for this PR PR_HASH="$(set -x; git log -n1 --format=format:'%H')" # Get differences between files declare -a DIFFS=($(set -x; git diff --name-only "${PR_HASH}" "${SATIJA_BRANCH}")) # Figure out which files have corresponding Rmds declare -a DIFF_RMDS=() declare -a MISSING=() for DFILE in ${DIFFS[@]}; do case $(dirname ${DFILE}) in # Only certain files will be checked docs) # Ensure we're only checking changed vignettes; these will have an extension of rmd or Rmd $(echo $(extension ${DFILE}) | grep -iw rmd > /dev/null 2> /dev/null) || continue DIFF_RMDS+=("${DFILE}") ;; R) # If a source file has changed, ensure it corresponds to a vignette if [[ $(basename ${DFILE}) == 'internal.R' ]]; then # If internal.R has changed, throw a warning for manual checks, but don't check everything on Azure echo "WARNING: internal.R has changed, please check all vignettes" >&2 continue elif [[ $(echo $(extension ${DFILE}) | grep -iw r > /dev/null 2> /dev/null; echo "$?") -eq 0 ]]; then # If an R file has changed, ask if it has a vignette # If so, add the vignette to the list of vignettes to check # If not, put it in list of source files missing a vignette BNAME="$(basename ${DFILE} .$(extension ${DFILE}))" DVIGNETTE=$(find docs -maxdepth 1 -iregex "^docs/${BNAME}\.rmd") [[ "${#DVIGNETTE}" -eq 0 ]] && MISSING+=("${DFILE}") || DIFF_RMDS+=("${DVIGNETTE}") else # Non-R files shouldn't be here continue fi ;; *) # All other files are not checked continue ;; esac done # Do we have vignettes to check if [[ ${#DIFF_RMDS[@]} -eq 0 && ${#MISSING[@]} -gt 0 ]]; then # No, but source files changed, throw an error echo "ERROR: Missing vignettes for all changed source files" >&2 exit 1 elif [[ ${#MISSING[@]} -gt 0 ]]; then # Yes, but some source files changed and we couldn't 
find a vignette echo -e "WARNING: Missing vignettes for the following source files:" >&2 for MV in ${MISSING[@]}; do echo -e "\t${MV}" >&2; done elif [[ ${#DIFF_RMDS[@]} -eq 0 ]]; then # No, and no source files changed echo "No changed vignettes" >&2 exit 0 else : fi # Store new vignettes in test-build dir mkdir test-build # Filter our changed vignettes list to only unique vignettes declare -a UNIQ_DIFFS=($(echo ${DIFF_RMDS[@]} | tr ' ' '\n' | sort | uniq)) for I in $(seq 1 ${#UNIQ_DIFFS[@]}); do TFILE="${UNIQ_DIFFS[$((${I} - 1))]}" echo "Testing vignette ${TFILE} (vignette ${I} of ${#UNIQ_DIFFS[@]})" >&2 (set -x; Rscript -e "rmarkdown::render('${TFILE}', output_format = 'all', output_dir = 'test-build')") done