Repository: YuLab-SMU/ChIPseeker Branch: devel Commit: c1f4a507f226 Files: 119 Total size: 433.4 KB Directory structure: gitextract_66qccjqf/ ├── .Rbuildignore ├── .github/ │ └── issue_template.md ├── .gitignore ├── .svnignore ├── .travis.yml ├── CONDUCT.md ├── DESCRIPTION ├── GEODATA ├── Makefile ├── NAMESPACE ├── NEWS ├── NEWS.md ├── R/ │ ├── AllGenerics.R │ ├── ChIPseeker-package.R │ ├── GEO.R │ ├── addGeneAnno.R │ ├── annotatePeak.R │ ├── covplot.R │ ├── csAnno.R │ ├── dplyr-verb.R │ ├── enrichOverlap.R │ ├── getFlankingGene.R │ ├── getGenomicAnnotation.R │ ├── getNearestFeatureIndicesAndDistances.R │ ├── plotAnno.R │ ├── plotDistToTSS.R │ ├── plotTagMatrix.R │ ├── readPeakFile.R │ ├── seq2gene.R │ ├── subset.R │ ├── tagMatrix.R │ ├── upsetplot.R │ ├── utilities.R │ ├── vennpie.R │ ├── vennplot.R │ └── zzz.R ├── README.Rmd ├── README.md ├── appveyor.yml ├── data/ │ ├── gsminfo.rda │ ├── tagMatrixList.rda │ └── ucsc_release.rda ├── inst/ │ ├── CITATION │ ├── extdata/ │ │ ├── processedGSM.rda │ │ └── sample_peaks.txt │ └── test-plot/ │ ├── test-plotPeakProf.R │ └── test-plotTagMatrix.R ├── man/ │ ├── ChIPseeker-package.Rd │ ├── ChIPseekerCache.Rd │ ├── annotatePeak.Rd │ ├── as.GRanges.Rd │ ├── as.data.frame.csAnno.Rd │ ├── check_upstream_and_downstream.Rd │ ├── combine_csAnno.Rd │ ├── covplot.Rd │ ├── csAnno-class.Rd │ ├── dot-ChIPseekerEnv.Rd │ ├── dotFun.Rd │ ├── downloadGEObedFiles.Rd │ ├── downloadGSMbedFiles.Rd │ ├── dropAnno.Rd │ ├── enrichAnnoOverlap.Rd │ ├── enrichPeakOverlap.Rd │ ├── getAnnoStat.Rd │ ├── getBioRegion.Rd │ ├── getGEOInfo.Rd │ ├── getGEOgenomeVersion.Rd │ ├── getGEOspecies.Rd │ ├── getGeneAnno.Rd │ ├── getGenomicAnnotation.Rd │ ├── getNearestFeatureIndicesAndDistances.Rd │ ├── getPromoters.Rd │ ├── getSampleFiles.Rd │ ├── getTagMatrix.Rd │ ├── getTagMatrix.binning.internal.Rd │ ├── getTagMatrix.internal.Rd │ ├── getTagMatrix2.Rd │ ├── getTagMatrix2.binning.internal.Rd │ ├── getTagMatrix2.internal.Rd │ ├── info.Rd │ ├── makeBioRegionFromGranges.Rd │ ├── make_label.Rd │ ├── overlap.Rd │ ├── peakHeatmap.Rd │ ├── peakHeatmap_multiple_Sets.Rd │ ├── peak_Profile_Heatmap.Rd │ ├── plotAnnoBar-methods.Rd │ ├── plotAnnoBar.Rd │ ├── plotAnnoPie-methods.Rd │ ├── plotAnnoPie.Rd │ ├── plotAvgProf.Rd │ ├── plotAvgProf.binning.Rd │ ├── plotAvgProf2.Rd │ ├── plotDistToTSS-methods.Rd │ ├── plotDistToTSS.data.frame.Rd │ ├── plotMultiProf.Rd │ ├── plotMultiProf.binning.Rd │ ├── plotMultiProf.binning.internal.Rd │ ├── plotMultiProf.normal.Rd │ ├── plotMultiProf.normal.internal.Rd │ ├── plotPeakProf.Rd │ ├── plotPeakProf2.Rd │ ├── plotPeakProf_MultiWindows.Rd │ ├── readPeakFile.Rd │ ├── reexports.Rd │ ├── seq2gene.Rd │ ├── show-methods.Rd │ ├── shuffle.Rd │ ├── tagHeatmap.Rd │ ├── upsetplot-methods.Rd │ ├── vennpie-methods.Rd │ ├── vennplot.Rd │ └── vennplot.peakfile.Rd ├── tests/ │ ├── testthat/ │ │ ├── test-bed.R │ │ ├── test-getTagMatrix.R │ │ └── test-txdb.R │ └── testthat.R └── vignettes/ ├── ChIPseeker.Rmd └── ChIPseeker.bib ================================================ FILE CONTENTS ================================================ ================================================ FILE: .Rbuildignore ================================================ .travis.yml .svnignore .gitignore ^.*\.DS_Store Makefile README.Rmd appveyor.yml GEODATA docs mkdocs .github ^CONDUCT\.md$ ================================================ FILE: .github/issue_template.md ================================================ ### Prerequisites + [ ] Have you read [Feedback](https://guangchuangyu.github.io/chipseeker/#feedback) and follow the [guide](https://guangchuangyu.github.io/2016/07/how-to-bug-author/)? * [ ] make sure your are using the latest release version * [ ] read the [documents](https://guangchuangyu.github.io/chipseeker/documentation/) * [ ] google your quesion/issue ### Describe you issue * [ ] Make a reproducible example (*e.g.* [1](https://gist.github.com/talonsensei/e1fad082657054207f249ec98f0920eb)) * [ ] your code should contain comments to describe the problem (*e.g.* what expected and actually happened?) ### Ask in right place * [ ] for bugs or feature requests, post here (github issue) * [ ] for questions, please post to [Bioconductor](https://support.bioconductor.org/) or [Biostars](https://www.biostars.org/) with tag `ChIPseeker` ================================================ FILE: .gitignore ================================================ .DS_Store data/.DS_Store inst/.DS_Store inst/extdata/.DS_Store inst/extdata/GEO_sample_data/.DS_Store .svn *~ docs/__init__.py docs/__pycache__ __init__.pyc ================================================ FILE: .svnignore ================================================ .git *.Rhistory .travis.yml appveyor.yml docs mkdocs .github ================================================ FILE: .travis.yml ================================================ ## reference: http://docs.travis-ci.com/user/languages/r/ language: r r: bioc-devel cache: packages bioc_required: true bioc_use_devel: true os: - linux - osx env: global: - _R_CHECK_FORCE_SUGGESTS_=False - R_LIBS="http://cran.rstudio.com" r_packages: - knitr - rmarkdown bioc_packages: - DO.db - DOSE - graphite - ReactomePA - org.Hs.eg.db - TxDb.Hsapiens.UCSC.hg19.knownGene - GenomicRanges - GenomicFeatures after_failure: - ./travis-tool.sh dump_logs r_github_packages: - jimhester/covr after_success: - Rscript -e 'library(covr); codecov()' notifications: email: recipients: gcyu@connect.hku.hk on_success: never on_failure: always ================================================ FILE: CONDUCT.md ================================================ # Contributor Code of Conduct As contributors and maintainers of this project, we pledge to respect all people who contribute through reporting issues, posting feature requests, updating documentation, submitting pull requests or patches, and other activities. We are committed to making participation in this project a harassment-free experience for everyone, regardless of level of experience, gender, gender identity and expression, sexual orientation, disability, personal appearance, body size, race, ethnicity, age, or religion. Examples of unacceptable behavior by participants include the use of sexual language or imagery, derogatory comments or personal attacks, trolling, public or private harassment, insults, or other unprofessional conduct. Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct. Project maintainers who do not follow the Code of Conduct may be removed from the project team. Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by opening an issue or contacting one or more of the project maintainers. This Code of Conduct is adapted from the Contributor Covenant (http:contributor-covenant.org), version 1.0.0, available at http://contributor-covenant.org/version/1/0/0/ ================================================ FILE: DESCRIPTION ================================================ Package: ChIPseeker Type: Package Title: ChIPseeker for ChIP peak Annotation, Comparison, and Visualization Version: 1.49.0 Authors@R: c( person(given = "Guangchuang", family = "Yu", email = "guangchuangyu@gmail.com", role = c("aut", "cre"), comment = c(ORCID = "0000-0002-6485-8781")), person(given = "Ming", family = "Li", email = "limiang929@gmail.com", role = "ctb"), person(given = "Qianwen", family = "Wang", email = "treywea@gmail.com", role = "ctb"), person(given = "Yun", family = "Yan", email = "youryanyun@gmail.com", role = "ctb"), person(given = "Hervé", family = "Pagès", email = "hpages.on.github@gmail.com", role = "ctb"), person(given = "Michael", family = "Kluge", email = "michael.kluge@bio.ifi.lmu.de", role = "ctb"), person(given = "Thomas", family = "Schwarzl", email = "schwarzl@embl.de", role = "ctb"), person(given = "Zhougeng", family = "Xu", email = "xuzhougeng@163.com", role = "ctb"), person(given = "Chun-Hui", family = "Gao", email="gaospecial@gmail.com", role = "ctb", comment=c(ORCID = "0000-0002-1445-7939")) ) Maintainer: Guangchuang Yu Description: This package implements functions to retrieve the nearest genes around the peak, annotate genomic region of the peak, statstical methods for estimate the significance of overlap among ChIP peak data sets, and incorporate GEO database for user to compare the own dataset with those deposited in database. The comparison can be used to infer cooperative regulation and thus can be used to generate hypotheses. Several visualization functions are implemented to summarize the coverage of the peak experiment, average profile and heatmap of peaks binding to TSS regions, genomic annotation, distance to TSS, and overlap of peaks or genes. Depends: R (>= 3.5.0) Imports: AnnotationDbi, aplot, BiocGenerics, boot, dplyr, enrichplot, IRanges, GenomeInfoDb, GenomicRanges, GenomicFeatures, ggplot2, gplots, graphics, grDevices, gtools, magrittr, methods, plotrix, parallel, RColorBrewer, rlang, rtracklayer, S4Vectors, scales, stats, tibble, TxDb.Hsapiens.UCSC.hg19.knownGene, utils, yulab.utils (>= 0.2.0) Suggests: clusterProfiler, ggimage, ggplotify, ggupset, ggVennDiagram, knitr, org.Hs.eg.db, prettydoc, ReactomePA, rmarkdown, testthat, TxDb.Hsapiens.UCSC.hg38.knownGene Remotes: GuangchuangYu/enrichplot URL: https://yulab-smu.top/contribution-knowledge-mining/ BugReports: https://github.com/YuLab-SMU/ChIPseeker/issues Encoding: UTF-8 VignetteBuilder: knitr ByteCompile: true License: Artistic-2.0 biocViews: Annotation, ChIPSeq, Software, Visualization, MultipleComparison RoxygenNote: 7.3.3 ================================================ FILE: GEODATA ================================================ UPDATE OF GEO DATA + 20947 bed file information in ChIPseeker (version >=1.9.8) <2016-09-20, Tue> + 19348 bed file information in ChIPseeker (version >= 1.7.15) <2016-03-21, Mon> + 18813 bed file information in ChIPseeker (version >= 1.5.11, BioC 3.2 devel) <2015-09-24, Thu> + >17,726 bed file information in ChIPseeker (version>=1.4.0, BioC 3.1) + >15,000 bed file information ChIPseeker (version>=1.2.0, BioC 3.0) ================================================ FILE: Makefile ================================================ PKGNAME := $(shell sed -n "s/Package: *\([^ ]*\)/\1/p" DESCRIPTION) PKGVERS := $(shell sed -n "s/Version: *\([^ ]*\)/\1/p" DESCRIPTION) PKGSRC := $(shell basename `pwd`) BIOCVER := RELEASE_3_23 all: rd check clean alldocs: rd readme rd: Rscript -e 'roxygen2::roxygenise(".")' readme: Rscript -e 'rmarkdown::render("README.Rmd", encoding="UTF-8")' build: #cd ..;\ #R CMD build $(PKGSRC) Rscript -e 'devtools::build()' build2: cd ..;\ R CMD build --no-build-vignettes $(PKGSRC) install: cd ..;\ R CMD INSTALL $(PKGNAME)_$(PKGVERS).tar.gz check: Rscript -e 'devtools::check()' ## cd ..;\ ## Rscript -e 'rcmdcheck::rcmdcheck("$(PKGNAME)_$(PKGVERS).tar.gz")' check2: build cd ..;\ R CMD check $(PKGNAME)_$(PKGVERS).tar.gz bioccheck: cd ..;\ Rscript -e 'BiocCheck::BiocCheck("$(PKGNAME)_$(PKGVERS).tar.gz")' clean: cd ..;\ $(RM) -r $(PKGNAME).Rcheck/ gitmaintain: git gc --auto;\ git prune -v;\ git fsck --full rmrelease: git branch -D $(BIOCVER) release: git checkout $(BIOCVER);\ git fetch --all update: git fetch --all;\ git checkout devel;\ git merge upstream/devel;\ git merge origin/devel biocinit: git remote add upstream git@git.bioconductor.org:packages/$(PKGNAME).git;\ git fetch --all push: git push upstream devel;\ git push origin devel ================================================ FILE: NAMESPACE ================================================ # Generated by roxygen2: do not edit by hand S3method(arrange,GRanges) S3method(as.data.frame,csAnno) S3method(filter,GRanges) S3method(mutate,GRanges) S3method(rename,GRanges) S3method(subset,csAnno) export(.) export(GRangesList) export(annotatePeak) export(as.GRanges) export(combine_csAnno) export(covplot) export(downloadGEObedFiles) export(downloadGSMbedFiles) export(dropAnno) export(enrichAnnoOverlap) export(enrichPeakOverlap) export(getAnnoStat) export(getBioRegion) export(getGEOInfo) export(getGEOgenomeVersion) export(getGEOspecies) export(getPromoters) export(getSampleFiles) export(getTagMatrix) export(makeBioRegionFromGranges) export(overlap) export(peakHeatmap) export(peakHeatmap_multiple_Sets) export(peak_Profile_Heatmap) export(plotAnnoBar) export(plotAnnoPie) export(plotAnnoPie.csAnno) export(plotAvgProf) export(plotAvgProf2) export(plotDistToTSS) export(plotPeakProf) export(plotPeakProf2) export(readPeakFile) export(rel) export(seq2gene) export(shuffle) export(tagHeatmap) export(vennpie) export(vennplot) export(vennplot.peakfile) exportClasses(csAnno) exportMethods(plotAnnoBar) exportMethods(plotAnnoPie) exportMethods(plotDistToTSS) exportMethods(show) exportMethods(upsetplot) exportMethods(vennpie) import(BiocGenerics) import(GenomeInfoDb) import(GenomicRanges) import(IRanges) import(S4Vectors) importFrom(AnnotationDbi,get) importFrom(AnnotationDbi,select) importFrom(BiocGenerics,end) importFrom(BiocGenerics,start) importFrom(GenomicFeatures,exonsBy) importFrom(GenomicFeatures,fiveUTRsByTranscript) importFrom(GenomicFeatures,genes) importFrom(GenomicFeatures,intronsByTranscript) importFrom(GenomicFeatures,threeUTRsByTranscript) importFrom(GenomicFeatures,transcripts) importFrom(GenomicFeatures,transcriptsBy) importFrom(GenomicRanges,GRangesList) importFrom(S4Vectors,metadata) importFrom(S4Vectors,subset) importFrom(TxDb.Hsapiens.UCSC.hg19.knownGene,TxDb.Hsapiens.UCSC.hg19.knownGene) importFrom(aplot,insert_bottom) importFrom(aplot,plot_list) importFrom(boot,boot) importFrom(boot,boot.ci) importFrom(dplyr,arrange) importFrom(dplyr,filter) importFrom(dplyr,group_by) importFrom(dplyr,mutate) importFrom(dplyr,summarise) importFrom(enrichplot,upsetplot) importFrom(ggplot2,aes) importFrom(ggplot2,aes_) importFrom(ggplot2,aes_string) importFrom(ggplot2,coord_fixed) importFrom(ggplot2,coord_flip) importFrom(ggplot2,element_blank) importFrom(ggplot2,element_text) importFrom(ggplot2,facet_grid) importFrom(ggplot2,geom_bar) importFrom(ggplot2,geom_blank) importFrom(ggplot2,geom_hline) importFrom(ggplot2,geom_line) importFrom(ggplot2,geom_rect) importFrom(ggplot2,geom_ribbon) importFrom(ggplot2,geom_segment) importFrom(ggplot2,geom_text) importFrom(ggplot2,geom_tile) importFrom(ggplot2,geom_vline) importFrom(ggplot2,ggplot) importFrom(ggplot2,ggtitle) importFrom(ggplot2,guide_legend) importFrom(ggplot2,labs) importFrom(ggplot2,rel) importFrom(ggplot2,scale_color_manual) importFrom(ggplot2,scale_fill_brewer) importFrom(ggplot2,scale_fill_distiller) importFrom(ggplot2,scale_fill_hue) importFrom(ggplot2,scale_fill_manual) importFrom(ggplot2,scale_x_continuous) importFrom(ggplot2,scale_y_continuous) importFrom(ggplot2,theme) importFrom(ggplot2,theme_bw) importFrom(ggplot2,theme_classic) importFrom(ggplot2,theme_minimal) importFrom(ggplot2,xlab) importFrom(ggplot2,xlim) importFrom(ggplot2,ylab) importFrom(gplots,plot.venn) importFrom(grDevices,colorRampPalette) importFrom(graphics,layout) importFrom(graphics,legend) importFrom(graphics,par) importFrom(graphics,pie) importFrom(graphics,plot.new) importFrom(gtools,permutations) importFrom(magrittr,"%<>%") importFrom(magrittr,"%>%") importFrom(methods,as) importFrom(methods,is) importFrom(methods,missingArg) importFrom(methods,new) importFrom(methods,show) importFrom(parallel,detectCores) importFrom(parallel,mclapply) importFrom(plotrix,floating.pie) importFrom(rlang,.data) importFrom(rlang,quos) importFrom(rtracklayer,import.chain) importFrom(rtracklayer,liftOver) importFrom(stats,p.adjust) importFrom(stats,phyper) importFrom(utils,data) importFrom(utils,download.file) importFrom(utils,read.delim) importFrom(utils,setTxtProgressBar) importFrom(utils,txtProgressBar) importFrom(yulab.utils,get_cache_element) importFrom(yulab.utils,get_cache_item) importFrom(yulab.utils,initial_cache_item) importFrom(yulab.utils,mat2df) importFrom(yulab.utils,rm_cache_item) importFrom(yulab.utils,update_cache_item) importFrom(yulab.utils,yulab_msg) ================================================ FILE: NEWS ================================================ CHANGES IN VERSION 1.15.2 ------------------------ o bug fixed for 'overlap = "all"' to consider strand information <2017-12-12, Tue> CHANGES IN VERSION 1.15.1 ------------------------ o define downstream distance via options(ChIPseeker.downstreamDistance = 3000) + https://support.bioconductor.org/p/103135/ CHANGES IN VERSION 1.13.1 ------------------------ o fixed issue of naming intronList <2017-07-06, Thu> + https://github.com/GuangchuangYu/ChIPseeker/issues/57#issuecomment-313342399 CHANGES IN VERSION 1.12.0 ------------------------ o BioC 3.5 release <2017-04-26, Wed> CHANGES IN VERSION 1.11.4 ------------------------ o bug fixed of intron rank <2017-04-19, Wed> + https://github.com/GuangchuangYu/ChIPseeker/issues/54 CHANGES IN VERSION 1.11.3 ------------------------ o bug fixed of dropAnno <2017-04-10, Mon> o bug fixed of peak width generated by shuffle <2017-03-31, Fri> + CHANGES IN VERSION 1.11.2 ------------------------ o optimize getGeneAnno <2016-12-21, Wed> o change plotAnnoBar and plotDistToTSS according to stacking bar order change in ggplot2 (v2.2.0) <2016-12-16, Fri> + https://github.com/GuangchuangYu/ChIPseeker/issues/47 + https://blog.rstudio.org/2016/11/14/ggplot2-2-2-0/ CHANGES IN VERSION 1.11.1 ------------------------ o update startup message <2016-11-09, Wed> CHANGES IN VERSION 1.10.0 ------------------------ o BioC 3.4 released <2016-10-18, Tue> CHANGES IN VERSION 1.9.8 ------------------------ o plotAvgProf/plotAvgProf2 order of panel by names of input tagMatrix List <2016-09-25, Sun> o test ENSEMBL ID using '^ENS' instead of '^ENSG' <2016-09-20, Tue> + https://github.com/GuangchuangYu/ChIPseeker/issues/41 CHANGES IN VERSION 1.9.7 ------------------------ o unit test <2016-08-16, Tue> CHANGES IN VERSION 1.9.6 ------------------------ o update vignette <2016-08-16, Tue> CHANGES IN VERSION 1.9.5 ------------------------ o when TxDb doesn't have gene_id information, converting gene ID (ensembl/entrez and symbol) will be omitted instead of throw error. <2016-08-02, Tue> + https://www.biostars.org/p/204142 o bug fixed if testing targetPeak is a list of GRanges objects in enrichPeakOverlap function <2016-07-20, Wed> + https://github.com/GuangchuangYu/ChIPseeker/issues/37 + https://github.com/GuangchuangYu/ChIPseeker/issues/36 o fixed typo in determine gene ID type <2016-06-21, Tue> + https://github.com/GuangchuangYu/ChIPseeker/issues/28#issuecomment-227212519 o move upsetplot generics to DOSE and import from DOSE to prevent function name conflict <2016-06-14, Tue> CHANGES IN VERSION 1.9.4 ------------------------ o bug fixed <2016-06-08, Wed> + https://github.com/GuangchuangYu/ChIPseeker/issues/17#issuecomment-224407402 + https://github.com/GuangchuangYu/ChIPseeker/pull/24/files CHANGES IN VERSION 1.9.3 ------------------------ o use byte compiler <2016-05-18, Wed> o https://github.com/Bioconductor-mirror/ChIPseeker/commit/f1ada57b9c66a1a44355bbbbdaf5b0a88e10cf7d CHANGES IN VERSION 1.9.2 ------------------------ o name tagMatrix in plotAvgProf automatically if missing <2016-05-12, Thu> o https://github.com/Bioconductor-mirror/ChIPseeker/commit/d5f16b2bc01725e30282c3acb33007ef521a514c CHANGES IN VERSION 1.9.1 ------------------------ o bug fixed in getNearestFeatureIndicesAndDistances <2016-05-11, Wed> + correct metadata in dummy NA feature CHANGES IN VERSION 1.8.0 ------------------------ o BioC 3.3 released <2016-05-05, Thu> CHANGES IN VERSION 1.7.15 ------------------------ o update GEO data <2016-03-21, Mon> CHANGES IN VERSION 1.7.14 ------------------------ o list_to_dataframe works with data frames that have different colnames <2016-03-10, Thu> CHANGES IN VERSION 1.7.13 ------------------------ o support annotate peaks with custom regions via passing TxDb=user_defined_GRanges to annotatePeak <2016-03-06, Sun> CHANGES IN VERSION 1.7.12 ------------------------ o fixed R check <2016-03-05, Sat> o implement list_to_dataframe that mimic ldply and remove ldply dependency <2016-03-05, Sat> CHANGES IN VERSION 1.7.11 ------------------------ o fixed issue in testing list in covplot introduced in 1.7.9 <2016-03-02, Wed> CHANGES IN VERSION 1.7.10 ------------------------ o determined gene ID type if TxDb doesn't contain corresponding metadata <2016-03-01, Tue> + fixed https://github.com/GuangchuangYu/ChIPseeker/issues/28 CHANGES IN VERSION 1.7.9 ------------------------ o covplot support GRangesList <2016-02-24, Wed> o update ReactomePA citation info <2016-02-17, Wed> CHANGES IN VERSION 1.7.8 ------------------------ o fixed BUG of Peaks upstream first or downstream last gene not annotated <2016-01-20, Wed> + contributed by Michael Kluge + see https://github.com/GuangchuangYu/ChIPseeker/pull/24 CHANGES IN VERSION 1.7.7 ------------------------ o bug fixed in newly introduced parameter 'overlap'. solve NA issue. <2016-01-13, Wed> CHANGES IN VERSION 1.7.6 ------------------------ o introduce 'overlap' parameter in annotatePeak, by default overlap="TSS" and only overlap with TSS will be reported as the nearest gene. if overlap="all", then gene overlap with peak will be reported as nearest gene, no matter the overlap is at TSS region or not. <2016-01-12, Tue> o bug fixed in find overlap with peaks have strand info. <2016-01-12, Tue> + see https://github.com/GuangchuangYu/ChIPseeker/issues/23 CHANGES IN VERSION 1.7.5 ------------------------ o add paramters, sameStrand,ignoreOverlap, ignoreUpstream and ignoreDownstream in annotatePeak <2016-01-10, Sun> + see https://github.com/GuangchuangYu/ChIPseeker/issues/17 o bug fixed in peak orientation <2016-01-10, Sun> + see https://github.com/GuangchuangYu/ChIPseeker/issues/22 CHANGES IN VERSION 1.7.4 ------------------------ o stop if input list of csAnno object has no name attribute + see https://github.com/GuangchuangYu/ChIPseeker/issues/21 + plotAnnoBar + plotDistToTSS o [covplot] xlim now not only restrict the window of data but also set the limit of the graphic object <2015-12-30, Wed> + see https://github.com/GuangchuangYu/ChIPseeker/issues/20 CHANGES IN VERSION 1.7.3 ------------------------ o fixed R check <2015-12-29, Tue> CHANGES IN VERSION 1.7.2 ------------------------ o use geom_rect instead of geom_segment in covplot <2015-11-30, Mon> o open lower parameter (by default =1) to specific lower cutoff of coverage signal <2015-11-29, Sun> o fixed covplot to work with None RleViews of specific chromosome <2015-11-29, Sun> o addFlankGeneInfo now works with level="gene" <2015-11-19, Thu> + see https://github.com/GuangchuangYu/ChIPseeker/issues/18 CHANGES IN VERSION 1.7.1 ------------------------ o fixed extracting ID type from TxDb object, since the change of metadata(TxDb). now using grep to extract. <2015-10-27, Tue> o add vp parameter to set viewport of vennpie on top of upsetplot by user request <2015-10-26, Mon> + see http://ygc.name/2015/07/28/upsetplot-in-chipseeker/#comment-19470 o getBioRegion function <2015-10-20, Tue> + see https://github.com/GuangchuangYu/ChIPseeker/issues/16 CHANGES IN VERSION 1.7.0 ------------------------ o BioC 3.3 branch CHANGES IN VERSION 1.5.11 ------------------------ o remove ellipsis parameter in enrichPeakOverlap function and extend it to support GRanges objects <2015-10-08, Thu> + see https://support.bioconductor.org/p/73069/ o fixed the issue, https://github.com/GuangchuangYu/ChIPseeker/issues/13 <2015-10-05, Mon> o update GEO info, now contains >18,000 bed file information <2015-09-24, Thu> CHANGES IN VERSION 1.5.10 ------------------------ o dropAnno function, eg. drop nearest gene annotation that far from TSS (>10k). <2015-09-17, Thu> + see https://github.com/GuangchuangYu/ChIPseeker/issues/9 + add parameter distanceToTSS_cutoff to enrichAnnoOverlap o use base::subset in plotDistToTSS instead of subsetting data within geom_bar <2015-09-17, Thu> + see https://github.com/hadley/ggplot2/issues/1295 + subset parameter in layer will be removed in next release of ggplot2. CHANGES IN VERSION 1.5.9 ------------------------ o bug fixed of enrichAnnoOverlap <2015-08-26, Wed> o change parameter order.matrix to order.by in upsetplot to meet the change of UpSetR pkg <2015-08-26, Wed> CHANGES IN VERSION 1.5.8 ------------------------ o better implementation of getFirstHitIndex. <2015-07-29, Wed> + contributed by Herve Pages. + see https://support.bioconductor.org/p/70432/#70545. CHANGES IN VERSION 1.5.7 ------------------------ o add vennpie parameter in upsetplot <2015-07-20, Mon> o upsetplot function for csAnno object <2015-07-20, Mon> CHANGES IN VERSION 1.5.6 ------------------------ o update citation info <2015-07-09, Thu> o BED file +1 shift for BED coordinate system start at 0 <2015-07-07, Tue> CHANGES IN VERSION 1.5.5 ------------------------ o seq2gene for linking genomic regions to genes by many-to-many mapping. <2015-06-29, Mon> CHANGES IN VERSION 1.5.4 ------------------------ o add pseudocount in enrichPeakOverlap to prevent 0 pvalue <2015-05-22, Fri> CHANGES IN VERSION 1.5.3 ------------------------ o convert the vignette from Rnw to Rmd format <2015-05-17, Sun> CHANGES IN VERSION 1.5.1 ------------------------ o minor bug fixed in getChrCov <2015-04-27, Mon> CHANGES IN VERSION 1.3.15 ------------------------ o update vignette <2015-03-31, Tue> CHANGES IN VERSION 1.3.14 ------------------------ o add pool parameter in enrichPeakOverlap <2015-03-30, Mon> CHANGES IN VERSION 1.3.13 ------------------------ o update enrichPeakOverlap to support nShuffle = 0, which now will report only overlay with pvalue = NA <2015-03-29, Sun> o add facet and free_y parameter for plotAvgProf and plotAvgProf2 <2015-03-29, Sun> o update docs <2015-03-29, Sun> o update plotAvgProf and plotAvgProf2 to fully supporting confidence interval, see https://github.com/GuangchuangYu/ChIPseeker/pull/6 <2015-03-29, Sun> CHANGES IN VERSION 1.3.12 ------------------------ o add confidence interval for plotAvgProf, see https://github.com/GuangchuangYu/ChIPseeker/issues/3 <2015-03-26, Thu> CHANGES IN VERSION 1.3.11 ------------------------ o add citation <2015-03-16, Mon> CHANGES IN VERSION 1.3.10 ------------------------ o update GEO data <2015-03-03, Tue> CHANGES IN VERSION 1.3.9 ------------------------ o add parameter *genomicAnnotationPriority* for annotatePeak function <2015-02-27, Fri> CHANGES IN VERSION 1.3.8 ------------------------ o add DOSE citation <2015-02-13, Fri> CHANGES IN VERSION 1.3.7 ------------------------ o bug fixed in plotDistToTSS <2015-02-06, Fri> CHANGES IN VERSION 1.3.6 ------------------------ o when peak is exactly located at gene end and near the end of chromosome, NA will be generated and throw error when assigning downstream of gene end. This bug has been fixed <2015-02-03, Tue> CHANGES IN VERSION 1.3.5 ------------------------ o bug fixed in getNearestFeatureIndicesAndDistances when peak in the very begining or end of the chromosome <2015-01-30, Fri> CHANGES IN VERSION 1.3.4 ------------------------ o bug fixed for introducing dplyr in plotDistToTSS <2015-01-28, Wed> CHANGES IN VERSION 1.3.3 ------------------------ o update vignette to use BiocStyle::latex() <2015-01-26, Mon> CHANGES IN VERSION 1.3.2 ------------------------ o fixed import issue to meet the changes of AnnotationDbi and S4Vectors <2015-01-22, Thu> CHANGES IN VERSION 1.1.21 ------------------------ o use data.table instead of data.frame to optimize covplot <2014-10-06, Mon> CHANGES IN VERSION 1.1.20 ------------------------ o update annotatePeak to store the seqinfo information <2014-09-30, Tue> o modified runValue(x) to sapply(x, runValue) <2014-09-30, Tue> CHANGES IN VERSION 1.1.19 ------------------------ o implement csAnno S4 object <2014-09-28, Sun> o modify plot function for csAnno instance <2014-09-28, Sun> o implement vennpie function <2014-09-28, Sun> CHANGES IN VERSION 1.1.17 ------------------------ o deprecate plotChrCov to new function covplot <2014-08-18, Mon> o add new paramter chrs and xlim to covplot <2014-08-18, Mon> CHANGES IN VERSION 1.1.16 ------------------------ o optimize plotChrCov, running time reduce drastically <2014-08-15, Fri> CHANGES IN VERSION 1.1.15 ------------------------ o remove un-mappable peak to prevent fail in peak annotation <2014-08-14, Thu> CHANGES IN VERSION 1.1.14 ------------------------ o bug fixed in plotDistToTSS <2014-08-14, Thu> CHANGES IN VERSION 1.1.13 ------------------------ o change TranscriptDb to TxDb according to GenomicFeatures <2014-07-29, Tue> CHANGES IN VERSION 1.1.12 ------------------------ o bug fixed in plotChrCov <2014-07-21, Mon> CHANGES IN VERSION 1.1.10 ------------------------ o bug fixed in calculating distances from peak end <2014-06-18, Wed> CHANGES IN VERSION 1.1.9 ------------------------ o add level parameter to annotatePeak, and set it to "transcript" by default. Now annotatePeak will annotate peaks in transcript level except user specify level = "gene" <2014-06-16, Mon> o add addFlankGeneInfo parameter to annotatePeak. If it set to true, all features within the flankDistance will be annotated. <2014-06-16, Mon> CHANGES IN VERSION 1.1.8 ------------------------ o bug fixed when peak overlap with feature <2014-06-11, Wed> o optimize for getting overlap features of peaks <2014-06-11, Wed> o update plotAnnoPie, separate the pie and legend to prevent label overlap <2014-06-12, Thu> CHANGES IN VERSION 1.1.7 ------------------------ o bug fixed in calculating distanceToTSS <2014-06-03, Tue> CHANGES IN VERSION 1.1.6 ------------------------ o add chainFile parameter in enrichAnnoOverlap and enrichPeakOverlap to support different genome version comparision <2014-06-01, Sun> o fixed color bug in peakHeatmap.internal2 and plotAnnoBar <2014-06-02, Mon> o update vignettes <2014-06-02, Mon> CHANGES IN VERSION 1.1.5 ------------------------ o export getPromoters and getTagMatrix <2014-05-31, Sat> o rename plotAvgProf to plotAvgProf2 and implement plotAvgProf based on tagMatrix <2014-05-31, Sat> o implement tagHeatmap for visualize heatmap of the tagMatrix or a list of tagMatrix <2014-05-31, Sat> o implement shuffle function to generate a random ChIP data based on a real one <2014-05-31, Sat> o implement enrichPeakOverlap to calcuate significant of ChIP experiments based on the genome coordinations <2014-05-31, Sat> o implement enrichAnnoOverlap to calculate significant of ChIP experiments based on their nearest gene annotation <2014-05-31, Sat> o incorporate GEO database for mining significant overlap of ChIP data <2014-05-31, Sat> + getGEOspecies summarize the collected data by species + getGEOgenomeVersion summarize the colleted data by genome version + getGEOInfo extract the information by genome version query + downloadGEObedFiles download all bed files of a particular genome version + downloadGSMbedFiles download the bed files of the input GSM list. CHANGES IN VERSION 1.1.4 ------------------------ o in the annotation column of output of annotatePeak function, if Exon/Intron, the output change to 'Transcript_Name/GeneID, Exon no. of total_no.' <2014-05-14, Wed> CHANGES IN VERSION 1.1.3 ------------------------ o bug fixed when metadata(TranscriptDb) contained NA <2014-04-30, Wed> o support ID type of Ensembl in annotatePeak (Entrez was supported) <2014-04-30, Wed> CHANGES IN VERSION 1.1.2 ------------------------ o implemented plotChrCov <2014-04-25, Fri> o implemented plotAvgProf and peakHeatmap <2014-04-24, Thu> CHANGES IN VERSION 1.1.1 ------------------------ o output of annotatePeak now contain chromosome length information <2014-04-22, Tue> o re-implement plotAnnoPie to use ordinary pie plot instead of pie3D <2014-04-21, Mon> CHANGES IN VERSION 1.0.0 ------------------------ o initial version with the following functions implemented: + annotatePeak + overlap + plotAnnoBar + plotAnnoPie + plotDistToTSS + readPeakFile + vennplot + vennplot.peakfile ================================================ FILE: NEWS.md ================================================ # ChIPseeker 1.48.0 + Bioconductor RELEASE_3_23 (2026-04-29, Wed) # ChIPseeker 1.47.1 + fixed issue in 'test-txdb.R' as 'TxDb.Hsapiens.UCSC.hg19.knownGene' changes its transcript ID from UCSC (e.g., uc002qsd.4) to Ensembl (e.g., ENST00000487630.1_3) (2025-11-04, Tue) # ChIPseeker 1.46.0 + Bioconductor RELEASE_3_22 (2025-11-01, Sat) # ChIPseeker 1.45.2 + new cache mechanism from 'yulab.utils' (2025-10-15, Wed) # ChIPseeker 1.44.0 + Bioconductor RELEASE_3_21 (2025-04-17, Thu) # ChIPseeker 1.42.0 + Bioconductor RELEASE_3_20 (2024-10-30, Wed) # ChIPseeker 1.41.3 + Better `covplot()`. Support universal chromosome names, and keep the default order of multiple peaks when plot a list of `GRanges` object. + Robust `generate_colors()`. Edit the logical of decision, and can validate color code automatically. + Extend dplyr verbs (`filter()`, `mutate()`, `arrange()`, `rename()`) to peak (`GRanges` object or `data.frame`), see #242. # ChIPseeker 1.41.2 + Enhancement of `plotDistToTSS()`, see #241. # ChIPseeker 1.41.1 + use `yulab.utils::yulab_msg()` for startup message (2024-07-26, Fri) # ChIPseeker 1.40.0 + Bioconductor RELEASE_3_19 (2024-05-15, Wed) # ChIPseeker 1.38.0 + Bioconductor RELEASE_3_18 (2023-10-25, Wed) # ChIPseeker 1.36.0 + Bioconductor RELEASE_3_17 (2023-05-03, Wed) # ChIPseeker 1.35.3 + fixed R check by removing calling `BiocStyle::Biocpkg()` in vignette, instead we use `yulab.utils::Biocpkg()` (2023-04-11, Tue) # ChIPseeker 1.35.2 + fixed R check by adding 'prettydoc' to Suggests (2023-04-04, Tue) # ChIPseeker 1.35.1 + use `ggplot` to plot heatmap (2022-12-30, Fri, #203) + update startup message to display the 'Current Protocols (2022)' paper. # ChIPseeker 1.34.0 + Bioconductor RELEASE_3_16 (2022-11-02, Wed) # ChIPseeker 1.33.4 + add citation Q. Wang (2022) (2022-10-29, Sat) # ChIPseeker 1.33.3 + allows passing user defined color to `vennpie()` (2022-10-20, Thu, #202, #207) + add `columns` paramter to `annotatePeak()` to better support passing `EnsDb` to `annoDb` (#193, #205) + export `getAnnoStat()` (#200, #204) # ChIPseeker 1.33.2 + supports `by = "ggVennDiagram"` in `vennplot` function (2022-09-13, Tue) # ChIPseeker 1.33.1 + `plotPeakProf()` allows passing GRanges object or a list of GRanges objects to TxDb parameter (2022-06-04, Sat) + add test files for `getTagMatrix()` and `plotTagMatrix()` + `getBioRegion()` supports UTR regions (3'UTR + 5'UTR) + `makeBioRegionFromGranges()` supports generating windoes from self-made GRanges object + allow specify colors in `covplot()` (2022-05-09, Mon, #185, #188) # ChIPseeker 1.32.0 + Bioconductor 3.15 release # ChIPseeker 1.31.4 + `readPeakFile` now supports `.broadPeak` and `.gappedPeak` files (2021-12-17, Fri, #173) # ChIPseeker 1.31.3 + bug fixed of determining promoter region in minus strand (2021-12-16, Thu, #172) # ChIPseeker 1.31.2 + update vignette # ChIPseeker 1.31.1 + bug fixed to take strand information (2021-11-10, Wed, #167) # ChIPseeker 1.30.0 + Bioconductor 3.14 release # ChIPseeker 1.29.2 + extend functions for plotting peak profiles to support other types of bioregions (2021-10-15, Fri, @MingLi-929, #156, #160, #162, #163) # ChIPseeker 1.29.1 + add example for `seq2gene` function (2021-05-21, Fri) # ChIPseeker 1.28.0 + Bioconductor 3.13 release (2021-05-20, Thu) # ChIPseeker 1.27.5 + update GEO data (103398/1973025 GSM) (2021-05-14, Fri) # ChIPseeker 1.27.4 + bug fixed in determine downstream gene (2021-04-27, Thu) - + `getBioRegion` now supports '3UTR' and '5UTR' (2021-03-30, Tue) - # ChIPseeker 1.27.3 + add two parameter, cex and radius, to `plotAnnoPie` (2021-03-12, Fri) - # ChIPseeker 1.27.2 + bug fixed of `getGenomicAnnotation` (2021-03-03, Wed) - # ChIPseeker 1.27.1 + Add support for `EnsDb` annotation databases in `annotatePeak`. - # ChIPseeker 1.26.0 + Bioconductor 3.12 release (2020-10-28, Wed) # ChIPseeker 1.23.1 + update GEO data (51079/762820 GSM) (2019-12-20, Fri) # ChIPseeker 1.22.0 + Bioconductor 3.10 release # ChIPseeker 1.21.1 + new implementation of `upsetplot` (2019-08-29, Thu) - use `ggupset`, `ggimage` and `ggplotify` + `subset` method for `csAnno` object (2019-08-27, Tue) # ChIPseeker 1.20.0 + Bioconductor 3.9 release # ChIPseeker 1.19.1 + add `origin_label = "TSS"` parameter to `plotAvgProf` (2018-12-12, Wed) - # ChIPseeker 1.18.0 + Bioconductor 3.8 release # ChIPseeker 1.17.2 + add `flip_minor_strand` parameter in `getTagMatrix` (2018-08-10, Fri) - should set to FALSE if windows if not symetric # ChIPseeker 1.17.1 + fixed issue of `vennpie` by adding pseudo-count +1 (2018-07-21, Sat) - # ChIPseeker 1.16.0 + Bioconductor 3.7 release # ChIPseeker 1.15.4 + If the required input is a named list and user input a list without name, set the name automatically and throw warning msg instead of error <2018-03-14, Wed> - + change `plotAvgProf`'s default y label <2018-03-14, Wed> - + plotAnnoBar now visualize barplot according to the order of input list (y-axis) (2018-02-27, Tue) - + follow renaming of RangesList class -> IntegerRangesList in IRanges v2.13.12 - # ChIPseeker 1.15.3 + options to ignore '1st exon', '1st intron', 'downstream' and promoter subcategory when summarizing result and visualization (2018-01-09, Tue) - + throw msg of 'file not found and skip' when requested url is not available when downloading BED file from GEO (2017-12-28, Thu) - + bug fixed of getGene (2017-12-27, Wed) ================================================ FILE: R/AllGenerics.R ================================================ ##' vennpie method generics ##' ##' ##' @docType methods ##' @name vennpie ##' @rdname vennpie-methods ##' @export setGeneric("vennpie", function(x, r = 0.2, cex = 1.2, ...) standardGeneric("vennpie") ) ##' plotDistToTSS method generics ##' ##' ##' @docType methods ##' @name plotDistToTSS ##' @rdname plotDistToTSS-methods ##' @export setGeneric("plotDistToTSS", function(x, distanceColumn="distanceToTSS", xlab="", ylab="Binding sites (%) (5'->3')", title="Distribution of transcription factor-binding loci relative to TSS", ...) standardGeneric("plotDistToTSS") ) ##' plotAnnoBar method generics ##' ##' ##' @docType methods ##' @name plotAnnoBar ##' @rdname plotAnnoBar-methods ##' @export setGeneric("plotAnnoBar", function(x, xlab="", ylab="Percentage(%)", title="Feature Distribution", ...) standardGeneric("plotAnnoBar") ) ##' plotAnnoPie method generics ##' ##' ##' @docType methods ##' @name plotAnnoPie ##' @rdname plotAnnoPie-methods ##' @export setGeneric("plotAnnoPie", function(x, ndigit=2, cex=0.9, col=NA, legend.position="rightside", pie3D=FALSE, radius=0.8, ...) standardGeneric("plotAnnoPie") ) ================================================ FILE: R/ChIPseeker-package.R ================================================ #' @keywords internal "_PACKAGE" ##' Information Datasets ##' ##' ucsc genome version, precalcuated data and gsm information ##' ##' @name info ##' @aliases ucsc_release ##' gsminfo ##' tagMatrixList ##' @docType data ##' @keywords datasets NULL ##' Name of the ChIPseeker cache environment (internal static variable) ##' @format character vector ChIPseekerCache <- "ChIPseekerEnv" ================================================ FILE: R/GEO.R ================================================ ######################################## ## ## ## data last update: Mar 03, 2015 ## ## ## ######################################## ##' accessing species statistics collecting from GEO database ##' ##' ##' @title getGEOspecies ##' @return data.frame ##' @author G Yu ##' @export getGEOspecies <- function() { gsminfo <- get_gsminfo() species <- gsminfo$organism res <- as.data.frame(table(species)) return(res) } ##' get genome version statistics collecting from GEO ChIPseq data ##' ##' ##' @title getGEOgenomeVersion ##' @return data.frame ##' @author G Yu ##' @export getGEOgenomeVersion <- function() { gsminfo <- get_gsminfo() gv <- gsminfo[, c("organism", "genomeVersion")] genomeVersion <- gv$genomeVersion res <- as.data.frame(table(genomeVersion)) gv <- unique(gv) res <- merge(gv, res, by.x="genomeVersion", by.y="genomeVersion", all.y=TRUE) res <- res[, c("organism", "genomeVersion", "Freq")] return(res) } ##' get subset of GEO information by genome version keyword ##' ##' ##' @title getGEOInfo ##' @param genome genome version ##' @param simplify simplify result or not ##' @return data.frame ##' @author G Yu ##' @export getGEOInfo <- function(genome, simplify =TRUE) { gsminfo <- get_gsminfo() genomeVersion <- NULL ## to satisfy codetools res <- subset(gsminfo, subset = genomeVersion == genome) if (simplify) { res <- res[,c("series_id", "gsm", "organism", "title", "supplementary_file", "genomeVersion", "pubmed_id")] } return(res) } ##' download all BED files of a particular genome version ##' ##' ##' @title downloadGEObedFiles ##' @param genome genome version ##' @param destDir destination folder ##' @return NULL ##' @author G Yu ##' @export downloadGEObedFiles <- function(genome, destDir=getwd()) { info <- getGEOInfo(genome) downloadGEO.internal(info, destDir) } ##' download BED supplementary files of a list of GSM accession numbers ##' ##' ##' @title downloadGSMbedFiles ##' @param GSM GSM accession numbers ##' @param destDir destination folder ##' @return NULL ##' @author G Yu ##' @export downloadGSMbedFiles <- function(GSM, destDir=getwd()) { gsminfo <- get_gsminfo() info <- gsminfo[gsminfo$gsm %in% GSM,] downloadGEO.internal(info, destDir) } ##' @importFrom utils download.file downloadGEO.internal <- function(info, destDir) { fnames <- as.character(info$supplementary_file) destfiles <- sub(".*\\/", paste(destDir, "/", sep=""), fnames) names(destfiles) <- NULL for (i in seq_along(fnames)) { if ( ! file.exists(destfiles[i]) ) tryCatch(download.file(fnames[i], destfile=destfiles[i], mode="wb"), error = function(e) message(fnames[i], ': file not found and skip')) } } ##' @importFrom utils data ## @importFrom GEOmetadb ## @importFrom RSQLite dbConnect ## @importFrom RSQLite dbGetQuery prepareGSMInfo <- function() { pkg <- "GEOmetadb" require(pkg, character.only=TRUE) getSQLiteFile <- eval(parse(text="getSQLiteFile")) ## get the latest version of sql file is.dl <- tryCatch(getSQLiteFile(), error = function(e) NULL) if (is.null(is.dl)) { url <- 'http://starbuck1.s3.amazonaws.com/sradb/GEOmetadb.sqlite.gz' HEAD <- eval(parse(text = "httr::HEAD")) hh <- HEAD(url) size <- hh$headers[["content-length"]] cmd <- paste('wget -c', url) while(file.info("GEOmetadb.sqlite.gz")$size < size) { system(cmd) } if (file.exists('GEOmetadb.sqlite') && file.exists('GEOmetadb.sqlite.gz')) { file.remove("GEOmetadb.sqlite") } system('gunzip GEOmetadb.sqlite.gz') } GEOmetadbFile="GEOmetadb.sqlite" file.info(GEOmetadbFile) sqlpkg <- "RSQLite" require(sqlpkg, character.only=TRUE) dbConnect <- eval(parse(text="dbConnect")) dbGetQuery <- eval(parse(text="dbGetQuery")) SQLite <- eval(parse(text="SQLite")) con <- dbConnect(SQLite(),GEOmetadbFile) ## dbListTables(con) pkg <- "GEOquery" require(pkg, character.only=TRUE) getGEO <- eval(parse(text="getGEO")) Meta <- eval(parse(text="Meta")) ## get all GPL IDs ## download soft using gpl = getGEO("GPLXXX") ## using Meta(gpl) find the technology match sequencing ## get all gsm IDs ## parse it gpl <- dbGetQuery(con, 'select gpl, technology from gpl') gpl <- gpl[gpl[,2] == "high-throughput sequencing",1] gpl <- gpl[!is.na(gpl)] ## save the processedGSM vector that contain all the GSM that have been processed. ## next time when preparing GSMInfo, filter those have been processed before. load(system.file("extdata/processedGSM.rda", package="ChIPseeker")) processedGSM <- get("processedGSM") newGSM <- c() gpldir <- "GPL" if (!file.exists(gpldir)) { dir.create(gpldir) } for (gid in gpl) { gg <- tryCatch(getGEO(gid, destdir=gpldir), error=function(e) NULL) if (is.null(gg)) { next } gsm <- Meta(gg)$sample_id gsm <- gsm[! (gsm %in% processedGSM) ] if (length(gsm) == 0) { next } newGSM <- c(newGSM, gsm) sf <- batchGetGSMsuppFile(gsm) if (!is.null(sf)) { save(sf, file=paste(gid, "_sf.rda", sep="")) } } processedGSM <- c(processedGSM, newGSM) processedGSM <- unique(processedGSM) save(processedGSM, file="../processedGSM.rda", compress="xz") sfiles <- list.files(pattern="_sf.rda") res <- data.frame(gsm=NULL, remoteFile=NULL) for (ff in sfiles) { load(ff) if (!is.null(sf)) { res <- rbind(res, sf) } } colnames(res)[2] <- "supplementary_file" GSMInfo <- lapply(unique(as.character(res$gsm)), function(i) { dbGetQuery(con,paste("select gsm,series_id,gpl,organism_ch1,title,characteristics_ch1,source_name_ch1,extract_protocol_ch1,description,data_processing,submission_date ", "from gsm where gsm='", i, "'", sep="")) }) GSMInfo <- do.call("rbind", GSMInfo) colnames(GSMInfo) <- sub("_ch1", "", colnames(GSMInfo)) gsminfo <- merge(GSMInfo, res, by.x="gsm", by.y="gsm") tryCatch(utils::data("ucsc_release", package="ChIPseeker")) ucsc_release <- get("ucsc_release") genVer <- lapply(1:nrow(gsminfo), function(i) getGenomicVersion(ucsc_release, gsminfo[i, "data_processing"], gsminfo[i, "organism"], gsminfo[i, "supplementary_file"]) ) gsminfo$genomeVersion <- unlist(genVer) gse <- as.character(gsminfo$series_id) pubmed <- lapply(gse, function(i) { dbGetQuery(con,paste("select gse,pubmed_id ", "from gse where gse='", i, "'", sep="")) }) pm <- do.call(rbind, pubmed) pm <- unique(pm) gsminfo <- merge(gsminfo, pm, by.x="series_id", by.y="gse", all.x=TRUE) ## remove non-ASCII characters for(i in 1:ncol(gsminfo)) { gsminfo[,i] = iconv(gsminfo[,i], "latin1", "ASCII", sub="") } gsminfo2 <- gsminfo rm(gsminfo) utils::data("gsminfo", package="ChIPseeker") gsminfo <- get("gsminfo") gsminfo <- rbind(gsminfo, gsminfo2) gsminfo <- unique(gsminfo) save(gsminfo, file="../gsminfo.rda", compress="xz") } getGenomicVersion <- function(ucsc_release, data_processing, organism, supplementary_file) { data_processing <- as.character(data_processing) organism <- as.character(organism) supplementary_file <- as.character(supplementary_file) species <- NULL gs <- subset(ucsc_release, subset = species == organism) if (nrow(gs) == 0) return(NA) genMatch <- unlist(sapply(gs$ucsc_version, grep, data_processing)) if (length(genMatch) == 0) { genMatch <- unlist(sapply(gs$ucsc_version, grep, supplementary_file)) if (length(genMatch) == 0) { return(NA) } } genVer <- names(genMatch) if (length(genVer) > 1) { genVer <- genVer[1] } return(genVer) } ## getGSE_ENCODE <- function() { ## encode=readLines("http://www.ncbi.nlm.nih.gov/geo/info/ENCODE.html") ## encode.chipseq <- encode[grep("ChIP-Seq", encode)] ## ## require(gsubfn) ## ## gse <- sapply(encode.chipseq, function(i) { ## ## res <- strapply(i, "(GSE\\d+)") ## ## unique(unlist(res)) ## ## }) ## gse <- sapply(encode.chipseq, gsub, pattern='.*(GSE\\d+).*', replacement='\\1') ## names(gse) <- NULL ## return(gse) ## } ## GSE2GSM <- function(GSE) { ## info <- getGEO(GSE, GSEMatrix=FALSE) ## metaInfo <- Meta(info) ## gsm <- metaInfo$sample_id ## return(gsm) ## } ##' @importFrom parallel mclapply ##' @importFrom parallel detectCores batchGetGSMsuppFile <- function(gsm) { suppfiles <- mclapply(seq_along(gsm), function(i) { cat("processing ", gsm[i], "\t", i , " of ", length(gsm), "\n") tryCatch(getGSMsuppFile(gsm[i]), error=function(e) NULL) }, mc.cores=detectCores()) suppfiles <- suppfiles[!unlist(lapply(suppfiles, is.null))] sf <- do.call("rbind", suppfiles) return(sf) } getGSMsuppFile <- function(GSM) { pkg <- "GEOquery" require(pkg, character.only=TRUE) getGEO <- eval(parse(text="getGEO")) Meta <- eval(parse(text="Meta")) destdir="geo_soft" if (!file.exists(destdir)) { dir.create(destdir) } info <- getGEO(GSM, GSEMatrix=FALSE, destdir=destdir) ## http://www.ncbi.nlm.nih.gov/geo/info/soft2.html metaInfo <- Meta(info) ## suppmentary file names fnames <- unlist(metaInfo[grep("supplementary", names(metaInfo))]) names(fnames) <- NULL i <- c(grep("bed.gz", fnames), grep("Peak.gz", fnames), grep("bedGraph.gz", fnames)) if (length(i) == 0) { message("No bed files found") return(NULL) } fnames <- fnames[i] res <- data.frame(gsm=GSM, remoteFile = fnames) return(res) } get_gsminfo <- function() { tryCatch(utils::data("gsminfo", package="ChIPseeker")) gsminfo <- get("gsminfo") return(gsminfo) } ================================================ FILE: R/addGeneAnno.R ================================================ ##' get gene annotation, symbol, gene name etc. ##' ##' ##' @title getGeneAnno ##' @param annoDb annotation package ##' @param geneID query geneID ##' @param type gene ID type ##' @param columns names of columns to be obtained from database ##' @return data.frame ##' @importFrom AnnotationDbi select ##' @author G Yu getGeneAnno <- function(annoDb, geneID, type, columns){ kk <- unlist(geneID) require(annoDb, character.only = TRUE) annoDb <- eval(parse(text=annoDb)) if (type == "Entrez Gene ID") { kt <- "ENTREZID" } else if (type =="Ensembl gene ID" || type == "Ensembl Gene ID") { kt <- "ENSEMBL" } else { message("geneID type is not supported...\tPlease report it to developer...\n") return(NA) } i <- which(!is.na(kk)) kk <- gsub("\\.\\d+$", "", kk) ann <- tryCatch( suppressWarnings(select(annoDb, keys=unique(kk[i]), keytype=kt, columns=columns)), error = function(e) NULL) if (is.null(ann)) { warning("ID type not matched, gene annotation will not be added...") return(NA) } idx <- getFirstHitIndex(ann[,kt]) ann <- ann[idx,] ## idx <- unlist(sapply(kk, function(x) which(x==ann[,kt]))) ## res <- matrix(NA, ncol=ncol(ann), nrow=length(kk)) %>% as.data.frame ## colnames(res) <- colnames(ann) ## res[i,] <- ann[idx,] rownames(ann) <- ann[, kt] res <- ann[as.character(kk),] return(res) } addGeneAnno <- function(peak.gr, annoDb, type, columns) { geneAnno <- getGeneAnno(annoDb, peak.gr$geneId, type, columns) if (! all(is.na(geneAnno))) { for(cn in colnames(geneAnno)[-1]) { mcols(peak.gr)[[cn]] <- geneAnno[, cn] } } return(peak.gr) } ================================================ FILE: R/annotatePeak.R ================================================ ##' Annotate peaks ##' ##' ##' @title annotatePeak ##' @param peak peak file or GRanges object ##' @param tssRegion Region Range of TSS ##' @param TxDb TxDb or EnsDb annotation object ##' @param level one of transcript and gene ##' @param assignGenomicAnnotation logical, assign peak genomic annotation or not ##' @param genomicAnnotationPriority genomic annotation priority ##' @param annoDb annotation package ##' @param addFlankGeneInfo logical, add flanking gene information from the peaks ##' @param flankDistance distance of flanking sequence ##' @param sameStrand logical, whether find nearest/overlap gene in the same strand ##' @param ignoreOverlap logical, whether ignore overlap of TSS with peak ##' @param ignoreUpstream logical, if True only annotate gene at the 3' of the peak. ##' @param ignoreDownstream logical, if True only annotate gene at the 5' of the peak. ##' @param overlap one of 'TSS' or 'all', if overlap="all", then gene overlap with peak will be reported as nearest gene, no matter the overlap is at TSS region or not. ##' @param verbose print message or not ##' @param columns names of columns to be obtained from database ##' @return data.frame or GRanges object with columns of: ##' ##' all columns provided by input. ##' ##' annotation: genomic feature of the peak, for instance if the peak is ##' located in 5'UTR, it will annotated by 5'UTR. Possible annotation is ##' Promoter-TSS, Exon, 5' UTR, 3' UTR, Intron, and Intergenic. ##' ##' geneChr: Chromosome of the nearest gene ##' ##' geneStart: gene start ##' ##' geneEnd: gene end ##' ##' geneLength: gene length ##' ##' geneStrand: gene strand ##' ##' geneId: entrezgene ID ##' ##' distanceToTSS: distance from peak to gene TSS ##' ##' if annoDb is provided, extra column will be included: ##' ##' ENSEMBL: ensembl ID of the nearest gene ##' ##' SYMBOL: gene symbol ##' ##' GENENAME: full gene name ##' @import BiocGenerics S4Vectors GenomeInfoDb ##' @examples ##' \dontrun{ ##' require(TxDb.Hsapiens.UCSC.hg19.knownGene) ##' txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene ##' peakfile <- system.file("extdata", "sample_peaks.txt", package="ChIPseeker") ##' peakAnno <- annotatePeak(peakfile, tssRegion=c(-3000, 3000), TxDb=txdb) ##' peakAnno ##' } ##' @seealso \code{\link{plotAnnoBar}} \code{\link{plotAnnoPie}} \code{\link{plotDistToTSS}} ##' @export ##' @author G Yu annotatePeak <- function(peak, tssRegion=c(-3000, 3000), TxDb=NULL, level = "transcript", assignGenomicAnnotation=TRUE, genomicAnnotationPriority = c("Promoter", "5UTR", "3UTR", "Exon", "Intron", "Downstream", "Intergenic"), annoDb=NULL, addFlankGeneInfo=FALSE, flankDistance=5000, sameStrand = FALSE, ignoreOverlap=FALSE, ignoreUpstream=FALSE, ignoreDownstream=FALSE, overlap = "TSS", verbose=TRUE, columns=c("ENTREZID", "ENSEMBL", "SYMBOL", "GENENAME")) { is_GRanges_of_TxDb <- FALSE if (is(TxDb, "GRanges")) { is_GRanges_of_TxDb <- TRUE assignGenomicAnnotation <- FALSE annoDb <- NULL addFlankGeneInfo <- FALSE message("#\n#.. 'TxDb' is a self-defined 'GRanges' object...\n#") message("#.. Some parameters of 'annotatePeak' will be disable,") message("#.. including:") message("#..\tlevel, assignGenomicAnnotation, genomicAnnotationPriority,") message("#..\tannoDb, addFlankGeneInfo and flankDistance.") message("#\n#.. Some plotting functions are designed for visualizing genomic annotation") message("#.. and will not be available for the output object.\n#") } if (is_GRanges_of_TxDb) { level <- "USER_DEFINED" } else { level <- match.arg(level, c("transcript", "gene")) } if (assignGenomicAnnotation && all(genomicAnnotationPriority %in% c("Promoter", "5UTR", "3UTR", "Exon", "Intron", "Downstream", "Intergenic")) == FALSE) { stop('genomicAnnotationPriority should be any order of c("Promoter", "5UTR", "3UTR", "Exon", "Intron", "Downstream", "Intergenic")') } if ( is(peak, "GRanges") ){ ## this test will be TRUE ## when peak is an instance of class/subclass of "GRanges" input <- "gr" peak.gr <- peak } else { input <- "file" peak.gr <- loadPeak(peak, verbose) } peakNum <- length(peak.gr) if (verbose) cat(">> preparing features information...\t\t", format(Sys.time(), "%Y-%m-%d %X"), "\n") if (is_GRanges_of_TxDb) { features <- TxDb } else { TxDb <- loadTxDb(TxDb) if (level=="transcript") { features <- getGene(TxDb, by="transcript") } else { features <- getGene(TxDb, by="gene") } } if (verbose) cat(">> identifying nearest features...\t\t", format(Sys.time(), "%Y-%m-%d %X"), "\n") ## nearest features idx.dist <- getNearestFeatureIndicesAndDistances(peak.gr, features, sameStrand, ignoreOverlap, ignoreUpstream,ignoreDownstream, overlap=overlap) if (verbose) cat(">> calculating distance from peak to TSS...\t", format(Sys.time(), "%Y-%m-%d %X"), "\n") ## distance distance <- idx.dist$distance ## update peak, remove un-map peak if exists. peak.gr <- idx.dist$peak ## annotation if (assignGenomicAnnotation == TRUE) { if (verbose) cat(">> assigning genomic annotation...\t\t", format(Sys.time(), "%Y-%m-%d %X"), "\n") anno <- getGenomicAnnotation(peak.gr, distance, tssRegion, TxDb, level, genomicAnnotationPriority, sameStrand=sameStrand) annotation <- anno[["annotation"]] detailGenomicAnnotation <- anno[["detailGenomicAnnotation"]] } else { annotation <- NULL detailGenomicAnnotation <- NULL } ## append annotation to peak.gr if (!is.null(annotation)) mcols(peak.gr)[["annotation"]] <- annotation has_nearest_idx <- which(idx.dist$index <= length(features)) nearestFeatures <- features[idx.dist$index[has_nearest_idx]] ## duplicated names since more than 1 peak may annotated by only 1 gene names(nearestFeatures) <- NULL nearestFeatures.df <- as.data.frame(nearestFeatures) if (is_GRanges_of_TxDb) { colnames(nearestFeatures.df)[1:5] <- c("geneChr", "geneStart", "geneEnd", "geneLength", "geneStrand") } else if (level == "transcript") { if (is(TxDb, "EnsDb")) { nearestFeatures.df <- nearestFeatures.df[, c("seqnames", "start", "end", "width", "strand", "gene_id", "tx_id", "tx_biotype"), drop = FALSE] colnames(nearestFeatures.df) <- c( "geneChr", "geneStart", "geneEnd", "geneLength", "geneStrand", "geneId", "transcriptId", "transcriptBiotype") } else { colnames(nearestFeatures.df) <- c("geneChr", "geneStart", "geneEnd", "geneLength", "geneStrand", "geneId", "transcriptId") nearestFeatures.df$geneId <- TXID2EG( as.character(nearestFeatures.df$geneId), geneIdOnly=TRUE) } } else { if (is(TxDb, "EnsDb")) { nearestFeatures.df <- nearestFeatures.df[, c("seqnames", "start", "end", "width", "strand", "gene_id", "gene_biotype"), drop = FALSE] colnames(nearestFeatures.df) <- c("geneChr", "geneStart", "geneEnd", "geneLength", "geneStrand", "geneId", "geneBiotype") } else colnames(nearestFeatures.df) <- c("geneChr", "geneStart", "geneEnd", "geneLength", "geneStrand", "geneId") } for(cn in colnames(nearestFeatures.df)) { mcols(peak.gr)[[cn]][has_nearest_idx] <- unlist(nearestFeatures.df[, cn]) } mcols(peak.gr)[["distanceToTSS"]] <- distance if (!is.null(annoDb)) { if (verbose) cat(">> adding gene annotation...\t\t\t", format(Sys.time(), "%Y-%m-%d %X"), "\n") .idtype <- IDType(TxDb) if (length(.idtype) == 0 || is.na(.idtype) || is.null(.idtype)) { n <- length(peak.gr) if (n > 100) n <- 100 sampleID <- peak.gr$geneId[1:n] if (all(grepl('^ENS', sampleID))) { .idtype <- "Ensembl Gene ID" } else if (all(grepl('^\\d+$', sampleID))) { .idtype <- "Entrez Gene ID" } else { warning("Unknown ID type, gene annotation will not be added...") .idtype <- NA } } if (!is.na(.idtype)) { peak.gr %<>% addGeneAnno(annoDb, .idtype, columns) } } if (addFlankGeneInfo == TRUE) { if (verbose) cat(">> adding flank feature information from peaks...\t", format(Sys.time(), "%Y-%m-%d %X"), "\n") flankInfo <- getAllFlankingGene(peak.gr, features, level, flankDistance) if (level == "transcript") { mcols(peak.gr)[["flank_txIds"]] <- NA mcols(peak.gr)[["flank_txIds"]][flankInfo$peakIdx] <- flankInfo$flank_txIds } mcols(peak.gr)[["flank_geneIds"]] <- NA mcols(peak.gr)[["flank_gene_distances"]] <- NA mcols(peak.gr)[["flank_geneIds"]][flankInfo$peakIdx] <- flankInfo$flank_geneIds mcols(peak.gr)[["flank_gene_distances"]][flankInfo$peakIdx] <- flankInfo$flank_gene_distances } if (!is_GRanges_of_TxDb) { if(verbose) cat(">> assigning chromosome lengths\t\t\t", format(Sys.time(), "%Y-%m-%d %X"), "\n") peak.gr@seqinfo <- seqinfo(TxDb)[names(seqlengths(peak.gr))] } if(verbose) cat(">> done...\t\t\t\t\t", format(Sys.time(), "%Y-%m-%d %X"), "\n") if (assignGenomicAnnotation) { res <- new("csAnno", anno = peak.gr, tssRegion = tssRegion, level=level, hasGenomicAnnotation = TRUE, detailGenomicAnnotation=detailGenomicAnnotation, annoStat=getGenomicAnnoStat(peak.gr), peakNum=peakNum ) } else { res <- new("csAnno", anno = peak.gr, tssRegion = tssRegion, level=level, hasGenomicAnnotation = FALSE, peakNum=peakNum ) } return(res) } ##' dropAnno ##' ##' drop annotation exceeding distanceToTSS_cutoff ##' @title dropAnno ##' @param csAnno output of annotatePeak ##' @param distanceToTSS_cutoff distance to TSS cutoff ##' @return csAnno object ##' @export ##' @author Guangchuang Yu dropAnno <- function(csAnno, distanceToTSS_cutoff=10000) { idx <- which(abs(mcols(csAnno@anno)[["distanceToTSS"]]) < distanceToTSS_cutoff) csAnno@anno <- csAnno@anno[idx] csAnno@peakNum <- length(idx) if (csAnno@hasGenomicAnnotation) { csAnno@annoStat <- getGenomicAnnoStat(csAnno@anno) csAnno@detailGenomicAnnotation = csAnno@detailGenomicAnnotation[idx,] } csAnno } ================================================ FILE: R/covplot.R ================================================ ##' plot peak coverage ##' ##' ##' @title covplot ##' @param peak peak file or GRanges object ##' @param weightCol weight column of peak ##' @param xlab xlab ##' @param ylab ylab ##' @param title title ##' @param chrs selected chromosomes to plot, all chromosomes by default ##' @param xlim ranges to plot, default is whole chromosome ##' @param lower lower cutoff of coverage signal ##' @param fill_color specify the color/palette for the plot. Order matters ##' @return ggplot2 object ##' @import GenomeInfoDb ##' @importFrom ggplot2 ggplot ##' @importFrom ggplot2 geom_segment ##' @importFrom ggplot2 geom_blank ##' @importFrom ggplot2 geom_rect ##' @importFrom ggplot2 facet_grid ##' @importFrom ggplot2 theme ##' @importFrom ggplot2 theme_classic ##' @importFrom ggplot2 element_text ##' @importFrom ggplot2 xlab ##' @importFrom ggplot2 ylab ##' @importFrom ggplot2 xlim ##' @importFrom ggplot2 ggtitle ##' @export ##' @author G Yu covplot <- function(peak, weightCol=NULL, xlab = "Chromosome Size (bp)", ylab = "", title = "ChIP Peaks over Chromosomes", chrs = NULL, xlim = NULL, lower = 1, fill_color = "black") { isList <- is.list(peak) if(!isList) { # Note: don't support data.frame tm <- getChrCov(peak = peak, weightCol = weightCol, chrs = chrs, xlim = xlim, lower = lower) } else { ltm <- lapply(peak, getChrCov, weightCol = weightCol, chrs = chrs, xlim = xlim, lower = lower) if (is.null(names(ltm))) { nn <- paste0("peak", seq_along(ltm)) warning("input is not a named list, set the name automatically to ", paste(nn, collapse = ' ')) names(ltm) <- nn } tm <- dplyr::bind_rows(ltm, .id = ".id") chr.sorted <- sortChrName(as.character(unique(tm$chr))) tm$chr <- factor(tm$chr, levels = chr.sorted) } chr <- start <- end <- value <- .id <- NULL if(length(tm$chr) == 0){ p <- ggplot(data.frame(x = 1)) + geom_blank() } else { p <- ggplot(tm, aes(start, value)) ## p <- p + geom_segment(aes(x=start, y=0, xend=end, yend= value)) if (isList) { if (length(fill_color) == length(peak) && all(is_valid_color(fill_color))){ cols = fill_color } else { cols = generate_colors(fill_color, n = length(peak)) } p <- p + geom_rect(aes(xmin = start, ymin = 0, xmax = end, ymax = value, fill = .id, color = .id)) + scale_color_manual(values = cols) + scale_fill_manual(values = cols) } else { p <- p + geom_rect(aes(xmin = start, ymin = 0, xmax = end, ymax = value), fill = fill_color, color = fill_color) } if(length(unique(tm$chr)) > 1) { p <- p + facet_grid(chr ~., scales="free") } } p <- p + theme_classic() p <- p + labs(x = xlab, y = ylab, title = title, fill = NULL, color = NULL) p <- p + scale_y_continuous(expand = c(0,0)) p <- p + theme(strip.text.y=element_text(angle=360)) p <- p + scale_x_continuous(labels = scales::label_number(scale_cut = scales::cut_si(""))) if (!is.null(xlim) && !all(is.na(xlim)) && is.numeric(xlim) && length(xlim) == 2) { p <- p + xlim(xlim) } return(p) } ##' @import S4Vectors IRanges ##' @importFrom dplyr group_by ##' @importFrom dplyr summarise ##' @importFrom magrittr %>% getChrCov <- function(peak, weightCol, chrs, xlim, lower=1) { if (is(peak, "GRanges")) { peak.gr <- peak } else if (file.exists(peak)) { peak.gr <- readPeakFile(peak, as="GRanges") } else { stop("peak should be a GRanges object or a peak file...") } if ( is.null(weightCol)) { peak.cov <- coverage(peak.gr) } else { weight <- mcols(peak.gr)[[weightCol]] peak.cov <- coverage(peak.gr, weight=weight) } cov <- lapply(peak.cov, IRanges::slice, lower=lower) get.runValue <- function(x) { y <- runValue(x) sapply(y@listData, mean) ## value <- x@subject@values ## value[value != 0] } chr <- start <- end <- cnt <- NULL ldf <- lapply(1:length(cov), function(i) { x <- cov[[i]] if (length(x@ranges) == 0) { msg <- paste0(names(cov[i]), " dosen't contain signal higher than ", lower) message(msg) return(NA) } data.frame(chr = names(cov[i]), start = start(x), end = end(x), cnt = get.runValue(x) # the following versions are more slower # unlist(runValue(x)) # sapply(x, runValue) ) }) ldf <- ldf[!is.na(ldf)] df <- do.call("rbind", ldf) chr.sorted <- sortChrName(as.character(unique(df$chr))) df$chr <- factor(df$chr, levels=chr.sorted) if (!is.null(chrs) && !all(is.na(chrs)) && all(chrs %in% chr.sorted)) { df <- df[df$chr %in% chrs, ] } if (!is.null(xlim) && !all(is.na(xlim)) && is.numeric(xlim) && length(xlim) == 2) { df <- df[df$start >= xlim[1] & df$end <= xlim[2],] } df2 <- group_by(df, chr, start, end) %>% summarise(value=sum(cnt), .groups = "drop") return(df2) } # a simple `stringr::str_sort(numeric=TRUE)` implementation sortChrName <- function(chr.name, decreasing = FALSE) { ## universal sort function, support organisms other than human chr_part <- sub("^(\\D*)(\\d*)$", "\\1", chr.name) num_part <- as.numeric(sub("^(\\D*)(\\d*)$", "\\2", chr.name)) chr.name[order(chr_part, num_part, decreasing = decreasing)] } ================================================ FILE: R/csAnno.R ================================================ ##' Class "csAnno" ##' This class represents the output of ChIPseeker Annotation ##' ##' ##' @name csAnno-class ##' @aliases csAnno-class ##' show,csAnno-method vennpie,csAnno-method ##' plotDistToTSS,csAnno-method plotAnnoBar,csAnno-method ##' plotAnnoPie,csAnno-method upsetplot,csAnno-method ##' subset,csAnno-method ##' ##' @docType class ##' @slot anno annotation ##' @slot tssRegion TSS region ##' @slot level transcript or gene ##' @slot hasGenomicAnnotation logical ##' @slot detailGenomicAnnotation Genomic Annotation in detail ##' @slot annoStat annotation statistics ##' @slot peakNum number of peaks ##' @exportClass csAnno ##' @author Guangchuang Yu \url{https://guangchuangyu.github.io} ##' @seealso \code{\link{annotatePeak}} ##' @keywords classes setClass("csAnno", representation=representation( anno = "GRanges", tssRegion = "numeric", level = "character", hasGenomicAnnotation = "logical", detailGenomicAnnotation="data.frame", annoStat="data.frame", peakNum="numeric" )) ##' convert csAnno object to GRanges ##' ##' ##' @title as.GRanges ##' @param x csAnno object ##' @return GRanges object ##' @author Guangchuang Yu \url{https://guangchuangyu.github.io} ##' @export as.GRanges <- function(x) { if (!is(x, "csAnno")) stop("not supported...") return(x@anno) } ##' getting status of annotation ##' ##' ##' @title getAnnoStat ##' @param x csAnno object ##' @export getAnnoStat <- function(x) { if (!is(x, "csAnno")) stop("not supported...") return(x@annoStat) } ##' Combine csAnno Object ##' ##' ##' https://github.com/YuLab-SMU/ChIPseeker/issues/157 ##' @title combine_csAnno ##' @param x csAnno object ##' @param ... csAnno objects ##' @return csAnno object ##' @export combine_csAnno <- function(x, ...){ z <- list(x, ...) if(sum(vapply(z, function(x) !is(x, "csAnno"), FUN.VALUE = logical(1))) != 0){ stop("not supported...") } if(length(z)<2){ stop("need two or more csAnno object...") } if(sum(!duplicated(lapply(z, function(x) x@tssRegion[1]))) != 1 && sum(!duplicated(lapply(z, function(x) x@tssRegion[2]))) != 1){ stop("the tss regions of different csAnno objects should be the same...") } if(sum(!duplicated(lapply(z, function(x) x@level))) != 1){ stop("the level of different csAnno object should be the same...") } if(sum(!duplicated(lapply(z, function(x) x@hasGenomicAnnotation))) != 1){ stop("the status of GenomicAnnotation should be the same...") } combine_tssRegion <- x@tssRegion combine_level <- x@level combine_hasGenomicAnnotation <- x@hasGenomicAnnotation combine_anno <- x@anno for(i in 2:length(z)){ combine_anno <- c(combine_anno,z[[i]]@anno) } combine_detailGenomicAnnotation <- lapply(z, function(x) x@detailGenomicAnnotation) combine_detailGenomicAnnotation <- do.call("rbind",combine_detailGenomicAnnotation) combine_peakNum <- x@peakNum for(i in 2:length(z)){ combine_peakNum <- combine_peakNum+z[[i]]@peakNum } feature <- x@annoStat$Feature for(i in 2:length(z)){ if(length(feature) dplyr::filter(!!!dots, .by = .by, .preserve = .preserve) |> droplevels() |> GenomicRanges::makeGRangesFromDataFrame(keep.extra.columns = TRUE) } # extend mutate to Peak (GRanges class object) #' @method mutate GRanges #' @importFrom dplyr mutate #' @export mutate.GRanges = function(.data, ..., .by = NULL, .keep = c("all", "used", "unused", "none"), .before = NULL, .after = NULL) { dots = rlang::quos(...) df = as.data.frame(.data) if (!is.null(.before) && !is.null(.after)) { stop("You can't supply both `.before` and `.after`.") } if (!is.null(.before)) { df = df |> dplyr::mutate(!!!dots, .by = .by, .keep = .keep, .before = .before) } else if (!is.null(.after)) { df = df |> dplyr::mutate(!!!dots, .by = .by, .keep = .keep, .after = .after) } else { df = df |> dplyr::mutate(!!!dots, .by = .by, .keep = .keep) } df |> GenomicRanges::makeGRangesFromDataFrame(keep.extra.columns = TRUE) } # S4Vectors::rename #' @method rename GRanges #' @importFrom rlang quos #' @export rename.GRanges = function(x, ...){ dots = rlang::quos(...) as.data.frame(x) |> dplyr::rename(!!!dots) |> GenomicRanges::makeGRangesFromDataFrame(keep.extra.columns = TRUE) } #' @method arrange GRanges #' @importFrom dplyr arrange #' @export arrange.GRanges = function(.data, ..., .by_group = FALSE){ dots = rlang::quos(...) as.data.frame(.data) |> dplyr::arrange(!!!dots, .by_group = .by_group) |> GenomicRanges::makeGRangesFromDataFrame(keep.extra.columns = TRUE) } ================================================ FILE: R/enrichOverlap.R ================================================ ##' calcuate overlap significant of ChIP experiments based on their nearest gene annotation ##' ##' ##' @title enrichAnnoOverlap ##' @param queryPeak query bed file ##' @param targetPeak target bed file(s) or folder containing bed files ##' @param TxDb TxDb ##' @param pAdjustMethod pvalue adjustment method ##' @param chainFile chain file for liftOver ##' @param distanceToTSS_cutoff restrict nearest gene annotation by distance cutoff ##' @return data.frame ##' @importFrom stats p.adjust ##' @importFrom stats phyper ##' @export ##' @importFrom rtracklayer import.chain ##' @importFrom rtracklayer liftOver ##' @importFrom yulab.utils get_cache_element ##' @importFrom yulab.utils update_cache_item ##' @author G Yu enrichAnnoOverlap <- function(queryPeak, targetPeak, TxDb=NULL, pAdjustMethod="BH", chainFile=NULL, distanceToTSS_cutoff=NULL) { TxDb <- loadTxDb(TxDb) query.anno <- annotatePeak(queryPeak, TxDb=TxDb, assignGenomicAnnotation=FALSE, annoDb=NULL, verbose=FALSE) if (is(targetPeak[1], "GRanges") || is(targetPeak[[1]], "GRanges")) { target.gr <- targetPeak targetFiles <- NULL } else { targetFiles <- parse_targetPeak_Param(targetPeak) target.gr <- lapply(targetFiles, loadPeak) } if (!is.null(chainFile)) { chain <- import.chain(chainFile) target.gr <- lapply(target.gr, liftOver, chain=chain) } target.anno <- lapply(target.gr, annotatePeak, TxDb=TxDb, assignGenomicAnnotation=FALSE, annoDb=NULL, verbose=FALSE) if (!is.null(distanceToTSS_cutoff)) { query.anno <- dropAnno(query.anno, distanceToTSS_cutoff) target.anno <- lapply(target.anno, dropAnno, distanceToTSS_cutoff = distanceToTSS_cutoff) } # ChIPseekerEnv <- get("ChIPseekerEnv", envir=.GlobalEnv) features <- get_cache_element(item = ChIPseekerCache, elements = "Transcripts") if(is.null(features)){ features <- transcriptsBy(TxDb) features <- unlist(features) update_cache_item(item = ChIPseekerCache, list("Transcripts" = features)) } # if ( exists("Transcripts", envir=ChIPseekerEnv, inherits=FALSE) ) { # features <- get("Transcripts", envir=ChIPseekerEnv) # } else { # features <- transcriptsBy(TxDb) # features <- unlist(features) # assign("Transcripts", features, envir=ChIPseekerEnv) # } ol <- lapply(target.anno, function(i) unique(intersect(as.GRanges(query.anno)$geneId, as.GRanges(i)$geneId))) oln <- unlist(lapply(ol, length)) N <- length(features) ## white ball m <- length(unique(as.GRanges(query.anno)$geneId)) ## black ball n <- N - m ## drawn k <- unlist(lapply(target.anno, function(i) length(unique(as.GRanges(i)$geneId)))) p <- phyper(oln, m, n, k, lower.tail=FALSE) if (is(queryPeak, "GRanges")) { qSample <- "queryPeak" } else { qSample <- basename(queryPeak) } if (is.null(targetFiles)) { tSample <- names(target.gr) if(is.null(tSample)) { tSample <- paste0("targetPeak", seq_along(target.gr)) } } else { tSample <- basename(targetFiles) } padj <- p.adjust(p, method=pAdjustMethod) res <- data.frame(qSample=qSample, tSample=tSample, qLen=length(unique(as.GRanges(query.anno)$geneId)), tLen=unlist(lapply(target.anno, function(i) length(unique(as.GRanges(i)$geneId)))), N_OL=oln, pvalue=p, p.adjust=padj) return(res) } ##' calculate overlap significant of ChIP experiments based on the genome coordinations ##' ##' ##' @title enrichPeakOverlap ##' @param queryPeak query bed file or GRanges object ##' @param targetPeak target bed file(s) or folder that containing bed files or a list of GRanges objects ##' @param TxDb TxDb ##' @param pAdjustMethod pvalue adjustment method ##' @param nShuffle shuffle numbers ##' @param chainFile chain file for liftOver ##' @param pool logical, whether pool target peaks ##' @param mc.cores number of cores, see \link[parallel]{mclapply} ##' @param verbose logical ##' @return data.frame ##' @export ##' @importFrom rtracklayer import.chain ##' @importFrom rtracklayer liftOver ##' @author G Yu enrichPeakOverlap <- function(queryPeak, targetPeak, TxDb=NULL, pAdjustMethod="BH", nShuffle=1000, chainFile=NULL, pool=TRUE, mc.cores=detectCores()-1, verbose=TRUE) { TxDb <- loadTxDb(TxDb) query.gr <- loadPeak(queryPeak) if (is(targetPeak[1], "GRanges") || is(targetPeak[[1]], "GRanges")) { target.gr <- targetPeak targetFiles <- NULL } else { targetFiles <- parse_targetPeak_Param(targetPeak) target.gr <- lapply(targetFiles, loadPeak) } if (!is.null(chainFile)) { chain <- import.chain(chainFile) target.gr <- lapply(target.gr, liftOver, chain=chain) } if (pool) { p.ol <- enrichOverlap.peak.internal(query.gr, target.gr, TxDb, nShuffle, mc.cores=mc.cores,verbose=verbose) } else { res_list <- lapply(1:length(target.gr), function(i) { enrichPeakOverlap(queryPeak = queryPeak, targetPeak = target.gr[i], TxDb = TxDb, pAdjustMethod = pAdjustMethod, nShuffle = nShuffle, chainFile = chainFile, mc.cores = mc.cores, verbose = verbose) }) res <- do.call("rbind", res_list) return(res) } if (is.null(p.ol$pvalue)) { p <- padj <- NA } else { p <- p.ol$pvalue padj <- p.adjust(p, method=pAdjustMethod) } ol <- p.ol$overlap if (is(queryPeak, "GRanges")) { qSample <- "queryPeak" } else { ## remove path, only keep file name qSample <- basename(queryPeak) } if (is.null(targetFiles)) { tSample <- names(target.gr) if(is.null(tSample)) { tSample <- paste0("targetPeak", seq_along(target.gr)) } } else { tSample <- basename(targetFiles) } res <- data.frame(qSample=qSample, tSample=tSample, qLen=length(query.gr), tLen=unlist(lapply(target.gr, length)), N_OL=ol, pvalue=p, p.adjust=padj) return(res) } ##' shuffle the position of peak ##' ##' ##' @title shuffle ##' @param peak.gr GRanges object ##' @param TxDb TxDb ##' @return GRanges object ##' @export ##' @author G Yu shuffle <- function(peak.gr, TxDb) { chrLens <- seqlengths(TxDb)[names(seqlengths(peak.gr))] nn <- as.vector(seqnames(peak.gr)) ii <- order(nn) w <- width(peak.gr) nnt <- table(nn) jj <- order(names(nnt)) nnt <- nnt[jj] chrLens <- chrLens[jj] ss <- unlist(sapply(1:length(nnt), function(i) sample(chrLens[i],nnt[i]))) res <- GRanges(seqnames=nn[ii], ranges=IRanges(ss, width=w[ii]), strand="*") return(res) } ##' @import GenomeInfoDb ##' @importFrom utils txtProgressBar ##' @importFrom utils setTxtProgressBar ##' @importFrom parallel mclapply ##' @importFrom parallel detectCores enrichOverlap.peak.internal <- function(query.gr, target.gr, TxDb, nShuffle=1000, mc.cores=detectCores()-1, verbose=TRUE) { if (verbose) { cat(">> permutation test of peak overlap...\t\t", format(Sys.time(), "%Y-%m-%d %X"), "\n") } idx <- sample(1:length(target.gr), nShuffle, replace=TRUE) len <- unlist(lapply(target.gr, length)) if(Sys.info()[1] == "Windows") { qLen <- lapply(target.gr, function(tt) { length(intersect(query.gr, tt)) }) } else { qLen <- mclapply(target.gr, function(tt) { length(intersect(query.gr, tt)) }, mc.cores=mc.cores ) } qLen <- unlist(qLen) ## query ratio qr <- qLen/len if (nShuffle < 1) { res <- list(pvalue=NULL, overlap=qLen) return(res) } if (verbose) { pb <- txtProgressBar(min=0, max=nShuffle, style=3) } if(Sys.info()[1] == "Windows") { rr <- lapply(seq_along(idx), function(j) { if (verbose) { setTxtProgressBar(pb, j) } i <- idx[j] tarShuffle <- shuffle(target.gr[[i]], TxDb) length(intersect(query.gr, tarShuffle))/len[i] }) } else { rr <- mclapply(seq_along(idx), function(j) { if (verbose) { setTxtProgressBar(pb, j) } i <- idx[j] tarShuffle <- shuffle(target.gr[[i]], TxDb) length(intersect(query.gr, tarShuffle))/len[i] }, mc.cores=mc.cores ) } if (verbose) { close(pb) } rr <- unlist(rr) ## random ratio ## p <- lapply(qr, function(q) mean(rr>q)) p <- lapply(qr, function(q) (sum(rr>q)+1)/(length(rr)+1)) res <- list(pvalue=unlist(p), overlap=qLen) return(res) } ================================================ FILE: R/getFlankingGene.R ================================================ ##' @import IRanges ##' @importFrom dplyr mutate ##' @importFrom dplyr group_by getAllFlankingGene <- function(peak.gr, features, level="transcript", distance=5000) { peak.gr2 <- peak.gr start(ranges(peak.gr)) = start(ranges(peak.gr)) - distance end(ranges(peak.gr)) = end(ranges(peak.gr)) + distance hit <- findOverlaps(peak.gr, unstrand(features)) qh <- queryHits(hit) sh <- subjectHits(hit) featureHit <- features[sh] names(featureHit)=NULL hitInfo <- as.data.frame(featureHit) if (level == "transcript") { eg <- TXID2EG(featureHit$tx_id, geneIdOnly=TRUE) hitInfo$geneId <- eg } else { cn <- colnames(hitInfo) colnames(hitInfo)[cn == "gene_id"] <- "geneId" } hitInfo$peakIdx <- qh overlapHit <- findOverlaps(peak.gr2, unstrand(featureHit)) hitInfo$distance <- NA hitInfo$distance[subjectHits(overlapHit)] <- 0 psD <- ifelse(strand(featureHit) == "+", start(peak.gr2[qh]) - start(featureHit), end(featureHit)-end(peak.gr2[qh])) peD <- ifelse(strand(featureHit) == "+", end(peak.gr2[qh]) - start(featureHit), end(featureHit)-start(peak.gr2[qh])) idx <- abs(psD) > abs(peD) dd <- psD dd[idx] <- peD[idx] ii <- is.na(hitInfo$distance) hitInfo$distance[ii] <- dd[ii] peakIdx <- tx_name <- geneId <- distance <- NULL if (level == "transcript") { hitInfo2 <- group_by(hitInfo, peakIdx) %>% mutate(flank_txIds=paste(tx_name, collapse=";"), flank_geneIds=paste(geneId, collapse=";"), flank_gene_distances=paste(distance, collapse=";")) res <- hitInfo2[,c("peakIdx", "flank_txIds", "flank_geneIds", "flank_gene_distances")] res$flank_txIds <- as.character(res$flank_txIds) } else { hitInfo2 <- group_by(hitInfo, peakIdx) %>% mutate(flank_geneIds=paste(geneId, collapse=";"), flank_gene_distances=paste(distance, collapse=";")) res <- hitInfo2[,c("peakIdx", "flank_geneIds", "flank_gene_distances")] } res <- unique(res) res$flank_geneIds <- as.character(res$flank_geneIds) res$flank_gene_distances <- as.character(res$flank_gene_distances) return(res) } ================================================ FILE: R/getGenomicAnnotation.R ================================================ updateGenomicAnnotation <- function(peaks, genomicRegion, type, anno, sameStrand=FALSE) { hits <- getGenomicAnnotation.internal(peaks, genomicRegion, type, sameStrand=sameStrand) if (length(hits) > 1) { hitIndex <- hits$queryIndex anno[["annotation"]][hitIndex] <- hits$annotation anno[["detailGenomicAnnotation"]][hitIndex, type] <- TRUE } return(anno) } ##' get Genomic Annotation of peaks ##' ##' ##' @title getGenomicAnnotation ##' @param peaks peaks in GRanges object ##' @param distance distance of peak to TSS ##' @param tssRegion tssRegion, default is -3kb to +3kb ##' @param TxDb TxDb object ##' @param level one of gene or transcript ##' @param genomicAnnotationPriority genomic Annotation Priority ##' @param sameStrand whether annotate gene in same strand ##' @importFrom GenomicFeatures threeUTRsByTranscript ##' @importFrom GenomicFeatures fiveUTRsByTranscript ##' @importFrom yulab.utils get_cache_element ##' @importFrom yulab.utils update_cache_item ##' @return character vector ##' @author G Yu getGenomicAnnotation <- function(peaks, distance, tssRegion=c(-3000, 3000), TxDb, level, genomicAnnotationPriority, sameStrand = FALSE ) { ## ## since some annotation overlap, ## a priority is assign based on *genomicAnnotationPriority* ## use the following priority by default: ## ## 1. Promoter ## 2. 5' UTR ## 3. 3' UTR ## 4. Exon ## 5. Intron ## 6. Downstream ## 7. Intergenic ## .ChIPseekerEnv(TxDb, item = ChIPseekerCache) # ChIPseekerEnv <- get("ChIPseekerEnv", envir=.GlobalEnv) annotation <- rep(NA, length(distance)) flag <- rep(FALSE, length(distance)) detailGenomicAnnotation <- data.frame( genic=flag, Intergenic=flag, Promoter=flag, fiveUTR=flag, threeUTR=flag, Exon=flag, Intron=flag, downstream=flag, distal_intergenic=flag) anno <- list(annotation=annotation, detailGenomicAnnotation=detailGenomicAnnotation) genomicAnnotationPriority <- rev(genomicAnnotationPriority) for (AP in genomicAnnotationPriority) { if (AP == "Intron") { ## Introns # intronList <- get_intronList(ChIPseekerEnv) intronList <- get_intronList(item = ChIPseekerCache) anno <- updateGenomicAnnotation(peaks, intronList, "Intron", anno, sameStrand=sameStrand) } else if (AP == "Exon") { ## Exons # exonList <- get_exonList(ChIPseekerEnv) exonList <- get_exonList(item = ChIPseekerCache) anno <- updateGenomicAnnotation(peaks, exonList, "Exon", anno, sameStrand=sameStrand) } else if (AP == "3UTR") { ## 3' UTR Exons threeUTRList <- get_cache_element(item = ChIPseekerCache, elements = "threeUTRList") if(is.null(threeUTRList)){ threeUTRList <- threeUTRsByTranscript(TxDb) update_cache_item(item = ChIPseekerCache, list("threeUTRList" = threeUTRList)) } # if ( exists("threeUTRList", envir=ChIPseekerEnv, inherits=FALSE) ) { # threeUTRList <- get("threeUTRList", envir=ChIPseekerEnv) # } else { # threeUTRList <- threeUTRsByTranscript(TxDb) # assign("threeUTRList", threeUTRList, envir=ChIPseekerEnv) # } anno <- updateGenomicAnnotation(peaks, threeUTRList, "threeUTR", anno, sameStrand=sameStrand) } else if (AP == "5UTR") { ## 5' UTR Exons fiveUTRList <- get_cache_element(item = ChIPseekerCache, elements = "fiveUTRList") if(is.null(fiveUTRList)){ fiveUTRList <- fiveUTRsByTranscript(TxDb) update_cache_item(item = ChIPseekerCache, list("fiveUTRList" = fiveUTRList)) } # if ( exists("fiveUTRList", envir=ChIPseekerEnv, inherits=FALSE) ) { # fiveUTRList <- get("fiveUTRList", envir=ChIPseekerEnv) # } else { # fiveUTRList <- fiveUTRsByTranscript(TxDb) # assign("fiveUTRList", fiveUTRList, envir=ChIPseekerEnv) # } anno <- updateGenomicAnnotation(peaks, fiveUTRList, "fiveUTR", anno, sameStrand=sameStrand) } else if (AP == "Promoter") { annotation <- anno[["annotation"]] ## detailGenomicAnnotation <- anno[["detailGenomicAnnotation"]] ## TSS tssIndex <- distance >= tssRegion[1] & distance <= tssRegion[2] annotation[tssIndex] <- "Promoter" anno$detailGenomicAnnotation[tssIndex, "Promoter"] <- TRUE pm <- max(abs(tssRegion)) if (pm/1000 >= 2) { dd <- seq(1:ceiling(pm/1000))*1000 for (i in 1:length(dd)) { if (i == 1) { lbs <- paste("Promoter", " (<=", dd[i]/1000, "kb)", sep="") annotation[abs(distance) <= dd[i] & annotation == "Promoter"] <- lbs } else { lbs <- paste("Promoter", " (", dd[i-1]/1000, "-", dd[i]/1000, "kb)", sep="") annotation[abs(distance) <= dd[i] & abs(distance) > dd[i-1] & annotation == "Promoter"] <- lbs } } } anno[["annotation"]] <- annotation } else { ## Intergenic annotation[is.na(annotation)] <- "Intergenic" anno[["annotation"]] <- annotation } } annotation <- anno[["annotation"]] detailGenomicAnnotation <- anno[["detailGenomicAnnotation"]] genicIndex <- which(apply(detailGenomicAnnotation[, c("Exon", "Intron")], 1, any)) detailGenomicAnnotation[-genicIndex, "Intergenic"] <- TRUE detailGenomicAnnotation[genicIndex, "genic"] <- TRUE ## intergenicIndex <- anno[["annotation"]] == "Intergenic" ## anno[["detailGenomicAnnotation"]][intergenicIndex, "Intergenic"] <- TRUE ## anno[["detailGenomicAnnotation"]][!intergenicIndex, "genic"] <- TRUE features <- getGene(TxDb, by=level) ## nearest from gene end if (sameStrand) { idx <- follow(peaks, features) } else { idx <- follow(peaks, unstrand(features)) } na.idx <- which(is.na(idx)) if (length(na.idx)) { idx <- idx[-na.idx] peaks <- peaks[-na.idx] } peF <- features[idx] dd <- ifelse(strand(peF) == "+", start(peaks) - end(peF), end(peaks) - start(peF)) if (length(na.idx)) { dd2 <- numeric(length(idx) + length(na.idx)) dd2[-na.idx] <- dd } else { dd2 <- dd } dsd <- getOption("ChIPseeker.downstreamDistance") if (is.null(dsd)) dsd <- 3000 ## downstream 3k by default ## downstream within dsd if(dsd/1000<=1){ j <- which(annotation == "Intergenic" & abs(dd2) <= dsd & dd2 != 0) if(length(j)>0){ lbs <- paste("Downstream (<=", dsd, "bp)", sep="") annotation[j] <- lbs } }else{ ## downstream within 0-dsd/1000 kb for(i in 1:(dsd/1000)){ j <- which(annotation == "Intergenic" & abs(dd2) <= i*1000 & dd2 != 0) if (length(j) > 0){ if (i == 1){ lbs <- "Downstream (<1kb)" }else{ lbs <- paste("Downstream (", i-1, "-", i, "kb)", sep="") } annotation[j] <- lbs } } ## downstream (dsd/1000) kb - dsd bp z <- which(annotation == "Intergenic" & abs(dd2) <= dsd & dd2 != 0) if(length(z)>0){ lbs <- paste("Downstream (",dsd/1000,"kb-", dsd, "bp)", sep="") annotation[z] <- lbs } } annotation[which(annotation == "Intergenic")] = "Distal Intergenic" downstreamIndex <- dd2 > 0 & dd2 < dsd detailGenomicAnnotation[downstreamIndex, "downstream"] <- TRUE detailGenomicAnnotation[which(annotation == "Distal Intergenic"), "distal_intergenic"] <- TRUE return(list(annotation=annotation, detailGenomicAnnotation=detailGenomicAnnotation)) } ##' @import BiocGenerics S4Vectors IRanges getGenomicAnnotation.internal <- function(peaks, genomicRegion, type, sameStrand=FALSE){ GRegion <- unlist(genomicRegion) GRegionLen <- elementNROWS(genomicRegion) names(GRegionLen) <- names(genomicRegion) GRegion$gene_id <- rep(names(genomicRegion), times=GRegionLen) if (type == "Intron") { gr2 <- GRegion[!duplicated(GRegion$gene_id)] strd <- as.character(strand(gr2)) len <- GRegionLen[GRegionLen != 0] GRegion$intron_rank <- lapply(seq_along(strd), function(i) { rank <- seq(1, len[i]) if (strd[i] == '-') rank <- rev(rank) return(rank) }) %>% unlist } if (type == "Intron" || type =="Exon") { nn <- TXID2EG(names(genomicRegion)) names(GRegionLen) <- nn GRegion$gene_id <- rep(nn, times=GRegionLen) } ## find overlap if (sameStrand) { GRegionHit <- findOverlaps(peaks, GRegion) } else { GRegionHit <- findOverlaps(peaks, unstrand(GRegion)) } if (length(GRegionHit) == 0) { return(NA) } qh <- queryHits(GRegionHit) hit.idx <- getFirstHitIndex(qh) GRegionHit <- GRegionHit[hit.idx] queryIndex <- queryHits(GRegionHit) subjectIndex <- subjectHits(GRegionHit) hits <- GRegion[subjectIndex] geneID <- hits$gene_id if (type == "Intron") { anno <- paste(type, " (", geneID, ", intron ", hits$intron_rank, " of ", GRegionLen[geneID], ")", sep="") } else if (type == "Exon") { anno <- paste(type, " (", geneID, ", exon ", hits$exon_rank, " of ", GRegionLen[geneID], ")", sep="") } else if (type == "fiveUTR") { anno <- "5' UTR" } else if (type == "threeUTR") { anno <- "3' UTR" } else { anno <- type } res <- list(queryIndex=queryIndex, annotation=anno, gene=geneID) return(res) } ================================================ FILE: R/getNearestFeatureIndicesAndDistances.R ================================================ ##' get index of features that closest to peak and calculate distance ##' ##' ##' @title getNearestFeatureIndicesAndDistances ##' @param peaks peak in GRanges ##' @param features features in GRanges ##' @param sameStrand logical, whether find nearest gene in the same strand ##' @param ignoreOverlap logical, whether ignore overlap of TSS with peak ##' @param ignoreUpstream logical, if True only annotate gene at the 3' of the peak. ##' @param ignoreDownstream logical, if True only annotate gene at the 5' of the peak. ##' @param overlap one of "TSS" or "all" ##' @return list ##' @import BiocGenerics IRanges GenomicRanges ##' @author G Yu getNearestFeatureIndicesAndDistances <- function(peaks, features, sameStrand = FALSE, ignoreOverlap=FALSE, ignoreUpstream=FALSE, ignoreDownstream=FALSE, overlap = "TSS") { overlap <- match.arg(overlap, c("TSS", "all")) if (!ignoreOverlap && overlap == "all") { overlap_hit <- findOverlaps(peaks, unstrand(features)) } ## peaks only conatin all peak records, in GRanges object ## feature is the annotation in GRanges object ## only keep start position based on strand ## start(features) <- end(features) <- ifelse(strand(features) == "+", start(features), end(features)) features <- resize(features, width=1) # faster ## add dummy NA feature for peaks that are at the last or first feature ## suggested by Michael Kluge features.bak <- features seqlevels(features) <- c(seqlevels(features), "chrNA") dummy <- GRanges("chrNA", IRanges(1,1)) ## dummy$tx_id <- -1 ## dummy$tx_name <- "NA" cns <- names(mcols(features)) for (cn in cns) { if (grepl('id', cn)) { mcols(dummy)[[cn]] <- -1 } else { mcols(dummy)[[cn]] <- NA } } features <- append(features, dummy) dummyID <- length(features) if (sameStrand) { ## nearest from peak start ps.idx <- follow(peaks, features) ## nearest from peak end pe.idx <- precede(peaks, features) } else { ps.idx <- follow(peaks, unstrand(features)) pe.idx <- precede(peaks, unstrand(features)) } na.idx <- is.na(ps.idx) & is.na(pe.idx) if (sum(na.idx) > 0) { ## suggested by Thomas Schwarzl ps.idx <- ps.idx[!na.idx] pe.idx <- pe.idx[!na.idx] ##peaks <- peaks[!na.idx] } # set NA values to dummy value if only one entry is affected ps.idx[is.na(ps.idx)] <- dummyID pe.idx[is.na(pe.idx)] <- dummyID ## features from nearest peak start psF <- features[ps.idx] ## feature distances from peak start psD <- ifelse(strand(psF) == "+", 1, -1) * (start(peaks[!na.idx]) - start(psF)) psD[ps.idx == dummyID] <- Inf # ensure that there is even no match if a seq with name "chrNA" exists ## features from nearest peak end peF <- features[pe.idx] ## feature distances from peak end peD <- ifelse(strand(peF) == "+", 1, -1) * (end(peaks[!na.idx]) - start(peF)) peD[pe.idx == dummyID] <- Inf # ensure that there is even no match if a seq with name "chrNA" exists ## restore the old feature object features <- features.bak pse <- data.frame(ps=psD, pe=peD) if (ignoreUpstream) { j <- rep(2, nrow(pse)) } else if (ignoreDownstream) { j <- rep(1, nrow(pse)) } else { j <- apply(pse, 1, function(i) which.min(abs(i))) } ## index idx <- ps.idx idx[j==2] <- pe.idx[j==2] ## distance dd <- psD dd[j==2] <- peD[j==2] index <- distanceToTSS <- rep(NA, length(peaks)) distanceToTSS[!na.idx] <- dd index[!na.idx] <- idx if (!ignoreOverlap) { ## hit <- findOverlaps(peaks, unstrand(features)) if (overlap == "all") { hit <- overlap_hit if ( length(hit) != 0 ) { qh <- queryHits(hit) hit.idx <- getFirstHitIndex(qh) hit <- hit[hit.idx] peakIdx <- queryHits(hit) featureIdx <- subjectHits(hit) index[peakIdx] <- featureIdx distance_both_end <- data.frame(start=start(peaks[peakIdx]) - start(features[featureIdx]), end = end(peaks[peakIdx]) - start(features[featureIdx])) distance_idx <- apply(distance_both_end, 1, function(i) which.min(abs(i))) distance_minimal <- distance_both_end[,1] distance_minimal[distance_idx == 2] <- distance_both_end[distance_idx==2, 2] distanceToTSS[peakIdx] <- distance_minimal * ifelse(strand(features[featureIdx]) == "+", 1, -1) } } hit <- findOverlaps(peaks, unstrand(features)) if ( length(hit) != 0 ) { qh <- queryHits(hit) hit.idx <- getFirstHitIndex(qh) hit <- hit[hit.idx] peakIdx <- queryHits(hit) featureIdx <- subjectHits(hit) index[peakIdx] <- featureIdx distanceToTSS[peakIdx] <- 0 } } j <- is.na(distanceToTSS) | is.na(index) res <- list(index=index[!j], distance=distanceToTSS[!j], peak=peaks[!j]) return(res) } isPeakFeatureOverlap <- function(peak, feature) { peakRange <- ranges(peak) featureRange <- ranges(feature) x <- intersect(peakRange, featureRange) return(length(x) != 0) } ================================================ FILE: R/plotAnno.R ================================================ ##' plot feature distribution based on their chromosome region ##' ##' plot chromosome region features ##' @title plotAnnoBar.data.frame ##' @rdname plotAnnoBar ##' @param anno.df annotation stats ##' @param xlab xlab ##' @param ylab ylab ##' @param title plot title ##' @param categoryColumn category column ##' @return bar plot that summarize genomic features of peaks ##' @importFrom ggplot2 ggplot ##' @importFrom ggplot2 aes_string ##' @importFrom ggplot2 geom_bar ##' @importFrom ggplot2 coord_flip ##' @importFrom ggplot2 theme_bw ##' @importFrom ggplot2 scale_x_continuous ##' @importFrom ggplot2 scale_fill_manual ##' @importFrom ggplot2 xlab ##' @importFrom ggplot2 ylab ##' @importFrom ggplot2 ggtitle ##' @importFrom ggplot2 guide_legend ##' @seealso \code{\link{annotatePeak}} \code{\link{plotAnnoPie}} ##' @author Guangchuang Yu \url{https://yulab-smu.top} plotAnnoBar.data.frame <- function(anno.df, xlab="", ylab="Percentage(%)", title="Feature Distribution", categoryColumn) { anno.df$Feature <- factor(anno.df$Feature, levels = rev(levels(anno.df$Feature))) p <- ggplot(anno.df, aes_string(x = categoryColumn, fill = "Feature", y = "Frequency")) p <- p + geom_bar(stat="identity") + coord_flip() + theme_bw() p <- p + ylab(ylab) + xlab(xlab) + ggtitle(title) if (categoryColumn == 1) { p <- p + scale_x_continuous(breaks=NULL) p <- p+scale_fill_manual(values=rev(getCols(nrow(anno.df))), guide=guide_legend(reverse=TRUE)) } else { p <- p+scale_fill_manual(values=rev(getCols(length(unique(anno.df$Feature)))), guide=guide_legend(reverse=TRUE)) } return(p) } ##' pieplot from peak genomic annotation ##' ##' ##' @title plotAnnoPie ##' @rdname plotAnnoPie ##' @param x csAnno object ##' @param ndigit number of digit to round ##' @param cex label cex ##' @param col color ##' @param legend.position topright or other. ##' @param pie3D plot in 3D or not ##' @param radius radius of Pie ##' @param ... extra parameter ##' @return pie plot of peak genomic feature annotation ##' @examples ##' \dontrun{ ##' require(TxDb.Hsapiens.UCSC.hg19.knownGene) ##' txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene ##' peakfile <- system.file("extdata", "sample_peaks.txt", package="chipseeker") ##' peakAnno <- annotatePeak(peakfile, TxDb=txdb) ##' plotAnnoPie(peakAnno) ##' } ##' @seealso \code{\link{annotatePeak}} \code{\link{plotAnnoBar}} ##' @export ##' @author Guangchuang Yu \url{https://yulab-smu.top} plotAnnoPie.csAnno <- function(x, ndigit=2, cex=0.8, col=NA, legend.position="rightside", pie3D=FALSE, radius=0.8, ...){ anno.df <- getAnnoStat(x) if (is.na(col[1])) { col <- getCols(nrow(anno.df)) } if (pie3D) annoPie3D(anno.df, ndigit=ndigit, cex=cex, col=col, ...) annoPie(anno.df, ndigit=ndigit, cex=cex, col=col, legend.position=legend.position, radius=radius, ...) } ##' @importFrom grDevices colorRampPalette ##' @importFrom graphics par ##' @importFrom graphics layout ##' @importFrom graphics pie ##' @importFrom graphics legend ##' @importFrom graphics plot.new annoPie <- function(anno.df, ndigit=2, cex=0.8, col=NA, legend.position, radius=0.8, ...) { if ( ! all(c("Feature", "Frequency") %in% colnames(anno.df))) { stop("check your input...") } if (legend.position == "rightside") { labels=paste(anno.df$Feature, " (", round(anno.df$Frequency/sum(anno.df$Frequency)*100, ndigit), "%)", sep="") par(mai = c(0,0,0,0)) layout(matrix(c(1,2), ncol=2), widths=c(0.6,0.4)) pie(anno.df$Frequency, labels=NA, cex=cex, col=col, ...) plot.new() legend("center", legend = labels, fill = col, bty = "n", cex = cex) } else { par(mai = c(0,0,0,0)) pie(anno.df$Frequency, ## ## labels=paste(round(anno.df$Frequency/sum(anno.df$Frequency)*100, 2), "%", sep=""), labels=paste(anno.df$Feature, " (", round(anno.df$Frequency/sum(anno.df$Frequency)*100, ndigit), "%)", sep=""), cex=cex, col=col, radius=radius, ... ) } } ## @param ndigit ndigit ## @param radius the radius of the pie ## @param explode the amount to "explode" the pie ## @param labelcex label font size ## @importFrom plotrix pie3D annoPie3D <- function(anno.df, ndigit=2, cex=1, ...){ ## anno.df <- getGenomicAnnoStat(peakAnno) pkg <- "plotrix" require(pkg, character.only=TRUE) pie3D <- eval(parse(text="pie3D")) pie3D(anno.df$Frequency, labels=paste( anno.df$Feature, "(", paste(round(anno.df$Frequency, ndigit), "%", sep=""), ")", sep=""), labelcex=cex, col=col, ...) } getGenomicAnnoStat <- function(peakAnno) { if(inherits(peakAnno,"GRanges")) peakAnno <- as.data.frame(peakAnno) anno <- peakAnno$annotation ## anno <- sub(" \\(.+", "", anno) e1 <- getOption("ChIPseeker.ignore_1st_exon") i1 <- getOption("ChIPseeker.ignore_1st_intron") ids <- getOption("ChIPseeker.ignore_downstream") if (is.null(e1) || !e1) { e1lab <- "1st Exon" anno[grep("exon 1 of", anno)] <- e1lab exonlab <- "Other Exon" } else { e1lab <- NULL exonlab <- "Exon" } if (is.null(i1) || !i1) { i1lab <- "1st Intron" anno[grep("intron 1 of", anno)] <- i1lab intronlab <- "Other Intron" } else { i1lab <- NULL intronlab <- "Intron" } anno[grep("Exon \\(", anno)] <- exonlab anno[grep("Intron \\(", anno)] <- intronlab if (is.null(ids) || !ids) { dsd <- getOption("ChIPseeker.downstreamDistance") if (is.null(dsd)) dsd <- 3000 ## downstream 3k by default if (dsd > 1000) { dsd <- round(dsd/1000, 1) dsd <- paste0(dsd, "kb") } dslab <- paste0("Downstream (<=", dsd, ")") anno[grep("Downstream", anno)] <- dslab iglab <- "Distal Intergenic" } else { dslab <- NULL iglab <- "Intergenic" anno[grep("Downstream", anno)] <- iglab } anno[grep("^Distal", anno)] <- iglab lvs <- c( "5' UTR", "3' UTR", e1lab, exonlab, i1lab, intronlab, dslab, iglab ) promoter <- unique(anno[grep("Promoter", anno)]) ip <- getOption("ChIPseeker.ignore_promoter_subcategory") if ((is.null(ip) || !ip) && (length(promoter) > 0)) { plab <- sort(as.character(promoter)) } else { plab <- "Promoter" anno[grep("^Promoter", anno)] <- plab } lvs <- c(plab, lvs) ## count frequency anno.table <- table(anno) ## calculate ratio anno.ratio <- anno.table/ sum(anno.table) * 100 anno.df <- as.data.frame(anno.ratio) colnames(anno.df) <- c("Feature", "Frequency") anno.df$Feature <- factor(anno.df$Feature, levels=lvs[lvs %in% anno.df$Feature]) anno.df <- anno.df[order(anno.df$Feature),] return(anno.df) } ================================================ FILE: R/plotDistToTSS.R ================================================ merge_two_si = function(x1, x2){ if (length(unique(gsub("^[0-9]+","",c(x1, x2)))) == 1){ return(paste0(gsub("[^0-9]*$","",x1), "-", x2)) } else { return(paste0(x1, "-", x2)) } } generate_break_lbs = function(breaks) { lbs = c() # break labels break_labels = scales::label_number(scale_cut = scales::cut_si(unit = "b"))(breaks) break_labels = gsub(" b$"," bp", break_labels) # category labels for (i in 2:length(breaks)) { if (i == length(breaks)) { lbs = c(lbs, paste0(">", break_labels[i-1])) } else { lbs = c(lbs, merge_two_si(break_labels[i-1], break_labels[i])) } } return(lbs) } generate_colors = function(palette = NULL, n) { # old color in version <= 1.41.1 old_color = c("#9ecae1", "#3182bd", "#C7A76C", "#86B875", "#39BEB1", "#CD99D8") if (is.null(palette)){ brewer_cols = old_color } else if (length(palette) == 1 && is_valid_palette(palette)){ brewer_cols = RColorBrewer::brewer.pal( name = palette, n = RColorBrewer::brewer.pal.info[palette, "maxcolors"] ) |> rev() } else if (all(is_valid_color(palette))){ brewer_cols = palette } else { warning("Your palette is non-valid, switching to default...") brewer_cols = old_color } if (length(brewer_cols) >= n) { cols = brewer_cols[1:length(brewer_cols)] } else { cols = grDevices::colorRampPalette(brewer_cols)(n) } return(cols) } is_valid_palette = function(palette){ palette %in% rownames(RColorBrewer::brewer.pal.info) } is_valid_color = function(color){ tryCatch({ grDevices::col2rgb(color) TRUE }, error = function(e) { FALSE }) } ##' plot feature distribution based on the distances to the TSS ##' ##' ##' @title plotDistToTSS.data.frame ##' @param peakDist peak annotation ##' @param distanceColumn column name of the distance from peak to nearest gene ##' @param distanceBreaks default is 'c(0, 1000, 3000, 5000, 10000, 100000)' ##' @param palette palette name for coloring different distances. Run `RColorBrewer::display.brewer.all()` to see all applicable values. ##' @param xlab x label ##' @param ylab y lable ##' @param title figure title ##' @param categoryColumn category column, default is ".id" ##' @return bar plot that summarize distance from peak to ##' TSS of the nearest gene. ##' @importFrom magrittr %<>% ##' @importFrom ggplot2 ggplot ##' @importFrom ggplot2 aes ##' @importFrom ggplot2 aes_string ##' @importFrom ggplot2 geom_bar ##' @importFrom ggplot2 geom_hline ##' @importFrom ggplot2 theme_bw ##' @importFrom ggplot2 coord_flip ##' @importFrom ggplot2 xlab ##' @importFrom ggplot2 ylab ##' @importFrom ggplot2 ggtitle ##' @importFrom ggplot2 geom_hline ##' @importFrom ggplot2 scale_y_continuous ##' @importFrom ggplot2 scale_x_continuous ##' @importFrom ggplot2 scale_fill_brewer ##' @importFrom ggplot2 scale_fill_hue ##' @importFrom ggplot2 scale_fill_manual ##' @importFrom ggplot2 geom_text ##' @importFrom rlang .data ##' @examples ##' \dontrun{ ##' require(TxDb.Hsapiens.UCSC.hg19.knownGene) ##' txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene ##' peakfile <- system.file("extdata", "sample_peaks.txt", package="ChIPseeker") ##' peakAnno <- annotatePeak(peakfile, TxDb=txdb) ##' plotDistToTSS(peakAnno) ##' } ##' @seealso \code{\link{annotatePeak}} ##' @author Guangchuang Yu \url{https://guangchuangyu.github.io} plotDistToTSS.data.frame <- function(peakDist, distanceColumn="distanceToTSS", distanceBreaks=c(0, 1000, 3000, 5000, 10000, 100000), palette = NULL, xlab="", ylab="Binding sites (%) (5'->3')", title="Distribution of transcription factor-binding loci relative to TSS", categoryColumn = ".id") { distanceBreaks = sort(distanceBreaks) hasZero = sum(distanceBreaks == 0) if (!hasZero) distanceBreaks = c(0, distanceBreaks) hasInf = sum(is.infinite(distanceBreaks)) if (!hasInf) distanceBreaks = c(distanceBreaks, Inf) lbs = generate_break_lbs(distanceBreaks) peakDist$Feature = cut(abs(peakDist[[distanceColumn]]), breaks = distanceBreaks, labels = lbs, include.lowest = TRUE) ## sign containing -1 and 1 for upstream and downstream peakDist$sign <- sign(peakDist[,distanceColumn]) ## count frequencies if (categoryColumn == 1) { peakDist = peakDist |> summarise(freq = length(.data$Feature), .by = c("Feature", "sign")) |> mutate(freq = .data$freq/sum(.data$freq) * 100) } else { peakDist = peakDist |> summarise(freq = length(.data$Feature), .by = c(categoryColumn, "Feature", "sign")) |> mutate(freq = .data$freq/sum(.data$freq) * 100, .by = categoryColumn) } if (any(peakDist$sign == 0)) { zeroDist <- peakDist[peakDist$sign == 0,] zeroDist$freq <- zeroDist$freq/2 zeroDist$sign <- -1 peakDist[peakDist$sign == 0,] <- zeroDist zeroDist$sign <- 1 peakDist <- rbind(peakDist, zeroDist) } if (categoryColumn == 1) { peakDist %<>% group_by(.data$Feature, .data$sign) %>% summarise(freq = sum(.data$freq)) totalFreq <- peakDist %>% group_by(.data$sign) %>% summarise(total = sum(.data$freq)) } else { peakDist %<>% group_by(.data$.id, .data$Feature, .data$sign) %>% summarise(freq = sum(.data$freq)) totalFreq <- peakDist %>% group_by(.data$.id, .data$sign) %>% summarise(total = sum(.data$freq)) } ## preparing ylim and y tick labels ds = max(totalFreq$total[totalFreq$sign == 1]) dslim = ceiling(ds/10) * 10 us = max(totalFreq$total[totalFreq$sign == -1]) uslim = ceiling(us/10) * 10 ybreaks <- seq(-uslim, dslim, by=10) ylbs <- abs(ybreaks) ylbs[ylbs == 0] <- "TSS" peakDist$Feature <- factor(peakDist$Feature, levels=rev(levels(peakDist$Feature))) if (categoryColumn == 1) { p <- ggplot(peakDist, aes(x=1, fill=.data$Feature)) } else { p <- ggplot(peakDist, aes(x=.data[[categoryColumn]], fill=.data$Feature)) } p <- p + geom_bar(data=subset(peakDist, sign==1), aes(y=.data$freq), stat="identity") + geom_bar(data=subset(peakDist, sign==-1), aes(y=-.data$freq), stat="identity") p <- p + geom_hline(yintercept = 0, colour = "black") + coord_flip() + theme_bw() + scale_y_continuous(breaks=ybreaks,labels=ylbs) p <- p + ylab(ylab) + xlab(xlab) + ggtitle(title) if (categoryColumn == 1) { p <- p + scale_x_continuous(breaks=NULL) } cols <- generate_colors(palette = palette, n = length(lbs)) p <- p + scale_fill_manual(values=rev(cols), guide=guide_legend(reverse=TRUE)) return(p) } ================================================ FILE: R/plotTagMatrix.R ================================================ ##' plot the profile of peaks ##'` ##' \code{plotPeakProf_MultiWindows()} is almost the same as \code{plotPeakProf2()}, having ##' the main difference of accepting two or more granges objects. Accepting more ##' granges objects can help compare the same peaks in different windows. ##' ##' \code{TxDb} parameter can accept txdb object. ##' But many regions can not be obtained by txdb object. In this case, ##' Users can provide self-made granges served the same role ##' as txdb object and pass to \code{TxDb} object. ##' ##' \code{by} the features of interest. ##' ##' (1) if users use \code{txdb}, \code{by} can be one of 'gene', 'transcript', 'exon', ##' 'intron' , '3UTR' , '5UTR', 'UTR'. These features can be obtained by functions from txdb object. ##' ##' (2) if users use self-made granges object, \code{by} can be everything. Because this \code{by} ##' will not pass to functions to get features, which is different from the case of using ##' txdb object. This \code{by} is only used to made labels showed in picture. ##' ##' \code{type} means the property of the region. one of the "start site", ##' "end site" and "body". ##' ##' \code{upstream} and \code{downstream} parameter have different usages: ##' ##' (1) if \code{type == 'body'}, \code{upstream} and \code{downstream} can use to extend ##' the flank of body region. ##' ##' (2) if \code{type == 'start_site'/'end_site'}, \code{upstream} and \code{downstream} refer to ##' the upstream and downstream of the start_site or the end_site. ##' ##' \code{weightCol} refers to column in peak file. This column acts as a weight value. Details ##' see \url{https://github.com/YuLab-SMU/ChIPseeker/issues/15} ##' ##' \code{nbin} refers to the number of bins. \code{getTagMatrix()} provide a binning method ##' to get the tag matrix. ##' ##' There are two ways input a list of window. ##' ##' (1) Users can input a list of self-made granges objects ##' ##' (2) Users can input a list of \code{by} and only one \code{type}. In this way, ##' \code{plotPeakProf_MultiWindows()} can made a list of window from txdb object based on \code{by} and \code{type}. ##' ##' Warning: ##' ##' (1) All of these window should be the same type. It means users can only ##' compare a list of "start site"/"end site"/"body region" with the same upstream ##' and downstream. ##' ##' (2) So it will be only one \code{type} and several \code{by}. ##' ##' (3) Users can make window by txdb object or self-made granges object. Users can only ##' choose one of 'gene', 'transcript', 'exon', 'intron' , '3UTR' , '5UTR' or 'UTR' in the ##' way of using txdb object. User can input any \code{by} in the way of using ##' self-made granges object. ##' ##' (4) Users can mingle the \code{by} designed for the two ways. \code{plotPeakProf_MultiWindows} can ##' accpet the hybrid \code{by}. But the above rules should be followed. ##' ##' \url{https://github.com/YuLab-SMU/ChIPseeker/issues/189} ##' ##' @title plotPeakProf_MultiWindows ##' ##' @param tagMatrix tagMatrix or a list of tagMatrix ##' @param peak peak file or GRanges object ##' @param weightCol column name of weight ##' @param TxDb TxDb object or self-made granges objects ##' @param upstream upstream position ##' @param downstream downstream position ##' @param by feature of interest ##' @param type one of "start_site", "end_site", "body" ##' @param windows_name the name for each window, which will also be showed in the picture as labels ##' @param xlab xlab ##' @param ylab ylab ##' @param conf confidence interval ##' @param facet one of 'none', 'row' and 'column' ##' @param free_y if TRUE, y will be scaled by AvgProf ##' @param verbose print message or not ##' @param nbin the amount of bines ##' @param ignore_strand ignore the strand information or not ##' @param ... additional parameter ##' @return ggplot object ##' @importFrom methods is ##' @importFrom methods as ##' @importFrom methods missingArg ##' @importFrom methods new ##' @export plotPeakProf <- function(tagMatrix = NULL, peak, upstream, downstream, conf, by, type, windows_name = NULL, weightCol = NULL, TxDb = NULL, xlab = "Genomic Region (5'->3')", ylab = "Peak Count Frequency", facet = "row", free_y = TRUE, verbose = TRUE, nbin = NULL, ignore_strand = FALSE, ...){ if(is.null(tagMatrix)){ conf <- if(missingArg(conf)) NA else conf upstream <- if(missingArg(upstream)) NULL else upstream downstream <- if(missingArg(downstream)) NULL else downstream if(length(by) == 1){ plotPeakProf2(peak = peak, upstream = upstream, downstream = downstream, conf = conf, by = by, type = type, weightCol = weightCol, TxDb = TxDb, xlab = xlab, ylab = ylab, facet = facet, free_y = free_y, verbose = verbose, nbin = nbin, ignore_strand = ignore_strand, ...) }else{ if(is.null(windows_name) && !is.null(names(TxDb))) windows_name <- names(TxDb) plotPeakProf_MultiWindows(peak = peak, upstream = upstream, downstream = downstream, conf = conf, by = by, type = type, windows_name = windows_name, weightCol = weightCol, TxDb = TxDb, xlab = xlab, ylab = ylab, facet = facet, free_y = free_y, verbose = verbose, nbin = nbin, ignore_strand = ignore_strand, ...) } }else{ if(is(tagMatrix, "list")){ upstream <- attr(tagMatrix[[1]], 'upstream') downstream <- attr(tagMatrix[[1]], 'downstream') label <- attr(tagMatrix[[1]], 'label') attr(tagMatrix, 'type') <- attr(tagMatrix[[1]], 'type') attr(tagMatrix, 'is.binning') <- attr(tagMatrix[[1]], 'is.binning') }else{ upstream <- attr(tagMatrix, 'upstream') downstream <- attr(tagMatrix, 'downstream') label <- attr(tagMatrix, 'label') } if(attr(tagMatrix, 'is.binning')){ if (!(missingArg(conf) || is.na(conf))){ plotAvgProf.binning(tagMatrix = tagMatrix, xlab = xlab, ylab = ylab, conf = conf, facet = facet, free_y = free_y, upstream = upstream, downstream = downstream, label = label, ...) }else{ plotAvgProf.binning(tagMatrix = tagMatrix, xlab = xlab, ylab = ylab, facet = facet, free_y = free_y, upstream = upstream, downstream = downstream, label = label, ...) } }else{ xlim <- c(-upstream, downstream) if (!(missingArg(conf) || is.na(conf))){ plotAvgProf (tagMatrix = tagMatrix, xlim = xlim, xlab = xlab, ylab = ylab, conf = conf, facet = facet, free_y = free_y, origin_label = label, ...) }else{ plotAvgProf (tagMatrix = tagMatrix, xlim = xlim, xlab = xlab, ylab = ylab, facet = facet, free_y = free_y, origin_label = label, ...) } } } } ##' plot the profile of peaks ##' ##' ##' @title plotAvgProf ##' @param tagMatrix tagMatrix or a list of tagMatrix ##' @param xlim xlim ##' @param xlab x label ##' @param ylab y label ##' @param conf confidence interval ##' @param facet one of 'none', 'row' and 'column' ##' @param free_y if TRUE, y will be scaled by AvgProf ##' @param origin_label label of the center ##' @param verbose print message or not ##' @param ... additional parameter ##' @return ggplot object ##' @author G Yu; Y Yan ##' @export plotAvgProf <- function(tagMatrix, xlim, xlab="Genomic Region (5'->3')", ylab = "Peak Count Frequency", conf, facet="none", free_y = TRUE, origin_label = "TSS", verbose = TRUE, ...) { ## S4Vectors change the behavior of ifelse ## see https://support.bioconductor.org/p/70871/ ## ## conf <- ifelse(missingArg(conf), NA, conf) if (verbose) { cat(">> plotting figure...\t\t\t", format(Sys.time(), "%Y-%m-%d %X"), "\n") } conf <- if(missingArg(conf)) NA else conf if (!(missingArg(conf) || is.na(conf))){ p <- plotAvgProf.internal(tagMatrix = tagMatrix, conf = conf, xlim = xlim, xlab = xlab, ylab = ylab, facet = facet, free_y = free_y, origin_label = origin_label, ...) } else { p <- plotAvgProf.internal(tagMatrix, xlim = xlim, xlab = xlab, ylab = ylab, facet = facet, free_y = free_y, origin_label = origin_label, ...) } return(p) } ##' @importFrom ggplot2 ggplot ##' @importFrom ggplot2 geom_line ##' @importFrom ggplot2 geom_vline ##' @importFrom ggplot2 geom_ribbon ##' @importFrom ggplot2 scale_x_continuous ##' @importFrom ggplot2 scale_color_manual ##' @importFrom ggplot2 xlab ##' @importFrom ggplot2 ylab ##' @importFrom ggplot2 theme_bw ##' @importFrom ggplot2 theme ##' @importFrom ggplot2 element_blank ##' @importFrom ggplot2 facet_grid plotAvgProf.internal <- function(tagMatrix, conf, xlim = c(-3000,3000), xlab = "Genomic Region (5'->3')", ylab = "Peak Count Frequency", facet="none", free_y = TRUE, origin_label, ...) { listFlag <- FALSE if (is(tagMatrix, "list")) { if ( is.null(names(tagMatrix)) ) { nn <- paste0("peak", seq_along(tagMatrix)) warning("input is not a named list, set the name automatically to ", paste(nn, collapse=' ')) names(tagMatrix) <- nn ## stop("tagMatrix should be a named list...") } listFlag <- TRUE } if ( listFlag ) { facet <- match.arg(facet, c("none", "row", "column")) if ( (xlim[2]-xlim[1]+1) != ncol(tagMatrix[[1]]) ) { stop("please specify appropreate xcoordinations...") } } else { if ( (xlim[2]-xlim[1]+1) != ncol(tagMatrix) ) { stop("please specify appropreate xcoordinations...") } } ## S4Vectors change the behavior of ifelse ## see https://support.bioconductor.org/p/70871/ ## ## conf <- ifelse(missingArg(conf), NA, conf) ## conf <- if(missingArg(conf)) NA else conf pos <- value <- .id <- Lower <- Upper <- NULL if ( listFlag ) { tagCount <- lapply(tagMatrix, function(x) getTagCount(x, xlim = xlim, conf = conf, ...)) tagCount <- list_to_dataframe(tagCount) tagCount$.id <- factor(tagCount$.id, levels=names(tagMatrix)) p <- ggplot(tagCount, aes(pos, group=.id, color=.id)) if (!(is.na(conf))) { p <- p + geom_ribbon(aes(ymin = Lower, ymax = Upper, fill = .id), linetype = 0, alpha = 0.2) } } else { tagCount <- getTagCount(tagMatrix, xlim = xlim, conf = conf, ...) p <- ggplot(tagCount, aes(pos)) if (!(is.na(conf))) { p <- p + geom_ribbon(aes(ymin = Lower, ymax = Upper), linetype = 0, alpha = 0.2) } } p <- p + geom_line(aes(y = value)) if ( 0 > xlim[1] && 0 < xlim[2] ) { p <- p + geom_vline(xintercept=0, linetype="longdash") p <- p + scale_x_continuous(breaks=c(xlim[1], floor(xlim[1]/2), 0, floor(xlim[2]/2), xlim[2]), labels=c(paste0(xlim[1],"bp"), paste0(floor(xlim[1]/2),"bp"), origin_label, paste0(floor(xlim[2]/2),"bp"), paste0(xlim[2], "bp"))) } if (listFlag) { cols <- getCols(length(tagMatrix)) p <- p + scale_color_manual(values=cols) if (facet == "row") { if (free_y) { p <- p + facet_grid(.id ~ ., scales = "free_y") } else { p <- p + facet_grid(.id ~ .) } } else if (facet == "column") { if (free_y) { p <- p + facet_grid(. ~ .id, scales = "free_y") } else { p <- p + facet_grid(. ~ .id) } } } p <- p+xlab(xlab)+ylab(ylab) p <- p + theme_bw() + theme(legend.title=element_blank()) if(facet != "none") { p <- p + theme(legend.position="none") } return(p) } ##' plot the profile of peaks that align to flank sequences of TSS ##' ##' This function is the old function of \code{plotPeakProf2}. It can ##' only plot the start site region of gene. ##' ##' @title plotAvgProf ##' @param peak peak file or GRanges object ##' @param weightCol column name of weight ##' @param TxDb TxDb object ##' @param upstream upstream position ##' @param downstream downstream position ##' @param xlab xlab ##' @param ylab ylab ##' @param conf confidence interval ##' @param facet one of 'none', 'row' and 'column' ##' @param free_y if TRUE, y will be scaled by AvgProf ##' @param verbose print message or not ##' @param ignore_strand ignore the strand information or not ##' @param ... additional parameter ##' @return ggplot object ##' @export ##' @author G Yu, Ming L plotAvgProf2 <- function(peak, weightCol = NULL, TxDb = NULL, upstream = 1000, downstream = 1000, xlab = "Genomic Region (5'->3')", ylab = "Peak Count Frequency", conf, facet = "none", free_y = TRUE, verbose = TRUE, ignore_strand = FALSE, ...) { plotPeakProf2(peak = peak, upstream = upstream, downstream = downstream, conf, by = "gene", type = "start_site", weightCol = weightCol, TxDb = TxDb, xlab = xlab, ylab = ylab, facet = facet, free_y = free_y, verbose = verbose, ignore_strand = ignore_strand, ...) } ##' plot the profile of peaks by binning ##' ##' ##' @title plotAvgProf.binning ##' @param tagMatrix tagMatrix or a list of tagMatrix ##' @param xlab x label ##' @param ylab y label ##' @param conf confidence interval ##' @param facet one of 'none', 'row' and 'column' ##' @param free_y if TRUE, y will be scaled ##' @param upstream rel object reflects the percentage of flank extension, e.g rel(0.2) ##' integer reflects the actual length of flank extension or TSS region ##' NULL reflects the gene body with no extension ##' @param downstream rel object reflects the percentage of flank extension, e.g rel(0.2) ##' integer reflects the actual length of flank extension or TSS region ##' NULL reflects the gene body with no extension ##' @param label label ##' @param ... additional parameter ##' @return ggplot object ##' @importFrom ggplot2 rel plotAvgProf.binning <- function(tagMatrix, xlab = "Genomic Region (5'->3')", ylab = "Peak Count Frequency", conf, facet ="none", free_y = TRUE, upstream = NULL, downstream = NULL, label, ...) { ## S4Vectors change the behavior of ifelse ## see https://support.bioconductor.org/p/70871/ ## ## conf <- ifelse(missingArg(conf), NA, conf) conf <- if(missingArg(conf)) NA else conf if (!(missingArg(conf) || is.na(conf))){ p <- plotAvgProf.binning.internal(tagMatrix , conf = conf, xlab = xlab, ylab = ylab, facet = facet, free_y = free_y, upstream = upstream, downstream = downstream, label = label, ...) } else { p <- plotAvgProf.binning.internal(tagMatrix , xlab = xlab, ylab = ylab, facet = facet, free_y = free_y, upstream = upstream, downstream = downstream, label = label, ...) } return(p) } ##' @importFrom ggplot2 ggplot ##' @importFrom ggplot2 geom_line ##' @importFrom ggplot2 geom_vline ##' @importFrom ggplot2 geom_ribbon ##' @importFrom ggplot2 scale_x_continuous ##' @importFrom ggplot2 scale_color_manual ##' @importFrom ggplot2 xlab ##' @importFrom ggplot2 ylab ##' @importFrom ggplot2 theme_bw ##' @importFrom ggplot2 theme ##' @importFrom ggplot2 element_blank ##' @importFrom ggplot2 facet_grid ##' @importFrom ggplot2 rel plotAvgProf.binning.internal <- function(tagMatrix, conf, xlab = "Genomic Region (5'->3')", ylab = "Peak Count Frequency", facet="none", free_y = TRUE, upstream = NULL, downstream = NULL, label, ...) { listFlag <- FALSE if (is(tagMatrix, "list")) { if ( is.null(names(tagMatrix )) ) { nn <- paste0("peak", seq_along(tagMatrix )) warning("input is not a named list, set the name automatically to ", paste(nn, collapse=' ')) names(tagMatrix) <- nn ## stop("tagMatrix should be a named list...") } listFlag <- TRUE } if(listFlag){ nbin <- dim(tagMatrix[[1]])[2] }else{ nbin <- dim(tagMatrix)[2] } xlim <- c(1,nbin) if ( listFlag ) { facet <- match.arg(facet, c("none", "row", "column")) } ## S4Vectors change the behavior of ifelse ## see https://support.bioconductor.org/p/70871/ ## ## conf <- ifelse(missingArg(conf), NA, conf) ## conf <- if(missingArg(conf)) NA else conf pos <- value <- .id <- Lower <- Upper <- NULL if ( listFlag ) { tagCount <- lapply(tagMatrix , function(x) getTagCount(x, xlim = xlim, conf = conf, ...)) tagCount <- list_to_dataframe(tagCount) tagCount$.id <- factor(tagCount$.id, levels=names(tagMatrix )) p <- ggplot(tagCount, aes(pos, group=.id, color=.id)) if (!(is.na(conf))) { p <- p + geom_ribbon(aes(ymin = Lower, ymax = Upper, fill = .id), linetype = 0, alpha = 0.2) } } else { tagCount <- getTagCount(tagMatrix , xlim = xlim, conf = conf, ...) p <- ggplot(tagCount, aes(pos)) if (!(is.na(conf))) { p <- p + geom_ribbon(aes(ymin = Lower, ymax = Upper), linetype = 0, alpha = 0.2) } } p <- p + geom_line(aes(y = value)) ## x_scale for genebody if(attr(tagMatrix, 'type') == 'body'){ ## x_scale for gene body with no flank extension if(is.null(upstream)){ p <- p + scale_x_continuous(breaks=c(1, floor(nbin*0.25), floor(nbin*0.5), floor(nbin*0.75), nbin), labels=c(label[1], "25%", "50%", "75%", label[2])) } ## x_scale for flank extension by relative value if(inherits(upstream, 'rel')){ p <- p + scale_x_continuous(breaks=c(1, floor(nbin*(as.numeric(upstream)*100/(100+(as.numeric(upstream)+as.numeric(downstream))*100))), floor(nbin*((as.numeric(upstream)*100+25)/(100+(as.numeric(upstream)+as.numeric(downstream))*100))), floor(nbin*((as.numeric(upstream)*100+50)/(100+(as.numeric(upstream)+as.numeric(downstream))*100))), floor(nbin*((as.numeric(upstream)*100+75)/(100+(as.numeric(upstream)+as.numeric(downstream))*100))), floor(nbin*((as.numeric(upstream)*100+100)/(100+(as.numeric(upstream)+as.numeric(downstream))*100))), nbin), labels=c(paste0("-",as.numeric(upstream)*100,"%"), label[1], "25%", "50%", "75%", label[2], paste0("+",as.numeric(downstream)*100,"%"))) p <- p + geom_vline(xintercept=floor(nbin*(as.numeric(upstream)*100/(100+(as.numeric(upstream)+as.numeric(downstream))*100))), linetype="longdash") p <- p + geom_vline(xintercept=floor(nbin*((as.numeric(upstream)*100+100)/(100+(as.numeric(upstream)+as.numeric(downstream))*100))), linetype="longdash") } ## x_scale for flank extension by absolute value if(!is.null(upstream) & !inherits(upstream, 'rel')){ upstreamPer <- floor(upstream/1000)*0.1 downstreamPer <- floor(downstream/1000)*0.1 p <- p + scale_x_continuous(breaks=c(1, floor(nbin*(upstreamPer/(1+upstreamPer+downstreamPer))), floor(nbin*((upstreamPer+0.25)/(1+upstreamPer+downstreamPer))), floor(nbin*((upstreamPer+0.5)/(1+upstreamPer+downstreamPer))), floor(nbin*((upstreamPer+0.75)/(1+upstreamPer+downstreamPer))), floor(nbin*((upstreamPer+1)/(1+upstreamPer+downstreamPer))), nbin), labels=c(paste0("-",upstream,"bp"), label[1], "25%", "50%", "75%", label[2], paste0(downstream,"bp"))) p <- p + geom_vline(xintercept=floor(nbin*(upstreamPer/(1+upstreamPer+downstreamPer))), linetype="longdash") p <- p + geom_vline(xintercept=floor(nbin*((upstreamPer+1)/(1+upstreamPer+downstreamPer))), linetype="longdash") } } ## x_scale for start region if(attr(tagMatrix, 'type') != 'body'){ p <- p + scale_x_continuous(breaks=c(1, floor(nbin*0.25), floor(nbin*0.5), floor(nbin*0.75), nbin), labels=c(paste0("-",upstream,"bp"), paste0("-",floor(upstream*0.5),"bp"), label, paste0(floor(downstream*0.5),"bp"), paste0(downstream,"bp"))) p <- p + geom_vline(xintercept=floor(nbin*0.5), linetype="longdash") } if (listFlag) { cols <- getCols(length(tagMatrix)) p <- p + scale_color_manual(values=cols) if (facet == "row") { if (free_y) { p <- p + facet_grid(.id ~ ., scales = "free_y") } else { p <- p + facet_grid(.id ~ .) } } else if (facet == "column") { if (free_y) { p <- p + facet_grid(. ~ .id, scales = "free_y") } else { p <- p + facet_grid(. ~ .id) } } } p <- p+xlab(xlab)+ylab(ylab) p <- p + theme_bw() + theme(legend.title=element_blank()) if(facet != "none") { p <- p + theme(legend.position="none") } return(p) } ##' plot the profile of peaks automatically ##' ##' \code{peak} stands for the peak file. ##' ##' \code{by} the features of interest. ##' ##' (1) if users use \code{txdb}, \code{by} can be one of 'gene', 'transcript', 'exon', ##' 'intron' , '3UTR' , '5UTR', 'UTR'. These features can be obtained by functions from txdb object. ##' ##' (2) if users use self-made granges object, \code{by} can be everything. Because this \code{by} ##' will not pass to functions to get features, which is different from the case of using ##' txdb object. This \code{by} is only used to made labels showed in picture. ##' ##' \code{type} means the property of the region. one of the "start site", ##' "end site" and "body". ##' ##' \code{upstream} and \code{downstream} parameter have different usages: ##' ##' (1) if \code{type == 'body'}, \code{upstream} and \code{downstream} can use to extend ##' the flank of body region. ##' ##' (2) if \code{type == 'start_site'/'end_site'}, \code{upstream} and \code{downstream} refer to ##' the upstream and downstream of the start_site or the end_site. ##' ##' \code{weightCol} refers to column in peak file. This column acts as a weight vaule. Details ##' see \url{https://github.com/YuLab-SMU/ChIPseeker/issues/15} ##' ##' \code{nbin} refers to the number of bins, providing a binning method ##' to get the tag matrix. ##' ##' \code{TxDb} parameter can accept txdb object. ##' But many regions can not be obtained by txdb object. In this case, ##' Users can provide self-made granges served the same role ##' as txdb object and pass to \code{TxDb} object. ##' ##' \code{plotPeakProf2()} is different from the \code{plotPeakProf()}. \code{plotPeakProf2()} do not ##' need to provide \code{window} parameter, which means \code{plotPeakProf2()} will call relevent ##' functions to make \code{window} automatically. ##' ##' @title plotPeakProf2 ##' @param peak peak file or GRanges object ##' @param weightCol column name of weight ##' @param TxDb TxDb object, or self-made granges object ##' @param upstream upstream position ##' @param downstream downstream position ##' @param by e.g. 'gene', 'transcript', 'exon' or features of interest(e.g. "enhancer") ##' @param type one of "start_site", "end_site", "body" ##' @param xlab xlab ##' @param ylab ylab ##' @param conf confidence interval ##' @param facet one of 'none', 'row' and 'column' ##' @param free_y if TRUE, y will be scaled by AvgProf ##' @param verbose print message or not ##' @param nbin the amount of nbines ##' @param ignore_strand ignore the strand information or not ##' @param ... additional parameter ##' @return ggplot object ##' @export ##' @author G Yu, Ming Li plotPeakProf2 <- function(peak, upstream, downstream, conf, by, type, weightCol = NULL, TxDb = NULL, xlab = "Genomic Region (5'->3')", ylab = "Peak Count Frequency", facet = "none", free_y = TRUE, verbose = TRUE, nbin = NULL, ignore_strand = FALSE, ...){ conf <- if(missingArg(conf)) NA else conf upstream <- if(missingArg(upstream)) NULL else upstream downstream <- if(missingArg(downstream)) NULL else downstream if ( is(peak, "list") ) { tagMatrix <- lapply(peak, getTagMatrix, upstream = upstream, downstream = downstream, type = type, TxDb = TxDb, by = by, weightCol = weightCol, nbin = nbin, verbose = verbose, ignore_strand = ignore_strand) } else { tagMatrix <- getTagMatrix(peak = peak, upstream = upstream, downstream = downstream, type = type, by = by, TxDb = TxDb, weightCol = weightCol, nbin = nbin, verbose = verbose, ignore_strand = ignore_strand) } if (!(missingArg(conf) || is.na(conf))){ p <- plotPeakProf(tagMatrix = tagMatrix, conf = conf, xlab = xlab, ylab = ylab, facet = facet, free_y = free_y, ...) } else { p <- plotPeakProf(tagMatrix = tagMatrix, xlab = xlab, ylab = ylab, facet= facet, free_y = free_y, ...) } return(p) } ##' plot the profile of peaks in two or more windows ##' ##' ##' This function comes from \url{https://github.com/YuLab-SMU/ChIPseeker/issues/189} ##'` ##' \code{plotPeakProf_MultiWindows()} is almost the same as \code{plotPeakProf2()}, having ##' the main difference of accepting two or more granges objects. Accepting more ##' granges objects can help compare the same peaks in different windows. ##' ##' \code{TxDb} parameter can accept txdb object. ##' But many regions can not be obtained by txdb object. In this case, ##' Users can provide self-made granges served the same role ##' as txdb object and pass to \code{TxDb} object. ##' ##' \code{by} the features of interest. ##' ##' (1) if users use \code{txdb}, \code{by} can be one of 'gene', 'transcript', 'exon', ##' 'intron' , '3UTR' , '5UTR', 'UTR'. These features can be obtained by functions from txdb object. ##' ##' (2) if users use self-made granges object, \code{by} can be everything. Because this \code{by} ##' will not pass to functions to get features, which is different from the case of using ##' txdb object. This \code{by} is only used to made labels showed in picture. ##' ##' \code{type} means the property of the region. one of the "start site", ##' "end site" and "body". ##' ##' \code{upstream} and \code{downstream} parameter have different usages: ##' ##' (1) if \code{type == 'body'}, \code{upstream} and \code{downstream} can use to extend ##' the flank of body region. ##' ##' (2) if \code{type == 'start_site'/'end_site'}, \code{upstream} and \code{downstream} refer to ##' the upstream and downstream of the start_site or the end_site. ##' ##' \code{weightCol} refers to column in peak file. This column acts as a weight value. Details ##' see \url{https://github.com/YuLab-SMU/ChIPseeker/issues/15} ##' ##' \code{nbin} refers to the number of bins. \code{getTagMatrix()} provide a binning method ##' to get the tag matrix. ##' ##' There are two ways input a list of window. ##' ##' (1) Users can input a list of self-made granges objects ##' ##' (2) Users can input a list of \code{by} and only one \code{type}. In this way, ##' \code{plotPeakProf_MultiWindows()} can made a list of window from txdb object based on \code{by} and \code{type}. ##' ##' Warning: ##' ##' (1) All of these window should be the same type. It means users can only ##' compare a list of "start site"/"end site"/"body region" with the same upstream ##' and downstream. ##' ##' (2) So it will be only one \code{type} and several \code{by}. ##' ##' (3) Users can make window by txdb object or self-made granges object. Users can only ##' choose one of 'gene', 'transcript', 'exon', 'intron' , '3UTR' , '5UTR' or 'UTR' in the ##' way of using txdb object. User can input any \code{by} in the way of using ##' self-made granges object. ##' ##' (4) Users can mingle the \code{by} designed for the two ways. \code{plotPeakProf_MultiWindows} can ##' accpet the hybrid \code{by}. But the above rules should be followed. ##' ##' ##' @title plotPeakProf_MultiWindows ##' @param peak peak file or GRanges object ##' @param weightCol column name of weight ##' @param TxDb TxDb object or self-made granges objects ##' @param upstream upstream position ##' @param downstream downstream position ##' @param by feature of interest ##' @param type one of "start_site", "end_site", "body" ##' @param windows_name the name for each window, which will also be showed in the picture as labels ##' @param xlab xlab ##' @param ylab ylab ##' @param conf confidence interval ##' @param facet one of 'none', 'row' and 'column' ##' @param free_y if TRUE, y will be scaled by AvgProf ##' @param verbose print message or not ##' @param nbin the amount of bines ##' @param ignore_strand ignore the strand information or not ##' @param ... additional parameter ##' @return ggplot object plotPeakProf_MultiWindows <- function(peak, upstream, downstream, conf, by, type, windows_name = NULL, weightCol = NULL, TxDb = NULL, xlab = "Genomic Region (5'->3')", ylab = "Peak Count Frequency", facet = "row", free_y = TRUE, verbose = TRUE, nbin = NULL, ignore_strand = FALSE, ...){ conf <- if(missingArg(conf)) NA else conf upstream <- if(missingArg(upstream)) NULL else upstream downstream <- if(missingArg(downstream)) NULL else downstream ## check type if(length(type) != 1){ stop("It should be only one type...") } ## make the window name if (is.null(windows_name)) { nn <- by warning("set the name automatically to ", paste(nn, collapse=' ')) windows_name <- nn }else{ if (length(windows_name) != length(by)) { stop("the length of the window name and the by should be equal...") } } if ( is(peak, "list") ) { tagMatrix <- lapply(peak, getTagMatrix2, upstream=upstream, downstream=downstream, windows_name=windows_name, type=type, by=by, TxDb=TxDb, weightCol = weightCol, nbin = nbin, verbose = verbose, ignore_strand= ignore_strand) } else { tagMatrix <- getTagMatrix2(peak=peak, upstream=upstream, downstream=downstream, windows_name=windows_name, type=type, by=by, TxDb=TxDb, weightCol = weightCol, nbin = nbin, verbose = verbose, ignore_strand= ignore_strand) } if (!(missingArg(conf) || is.na(conf))){ p <- plotMultiProf(tagMatrix = tagMatrix, conf = conf, xlab = xlab, ylab = ylab, facet = facet, free_y = free_y, ...) } else { p <- plotMultiProf(tagMatrix = tagMatrix, xlab = xlab, ylab = ylab, facet= facet, free_y = free_y, ...) } return(p) } ##' internal function for plotPeakProf_MultiWindows ##' ##' @param tagMatrix tagMatrix ##' @param xlab xlab ##' @param ylab ylab ##' @param conf confidence interval ##' @param facet one of 'none', 'row' and 'column' ##' @param free_y if TRUE, y will be scaled by AvgProf ##' @param ... additional parameter plotMultiProf <- function(tagMatrix, conf, xlab="Genomic Region (5'->3')", ylab = "Peak Count Frequency", facet="none", free_y = TRUE, ...){ if(is(tagMatrix[[1]][[1]],"matrix")){ upstream <- attr(tagMatrix[[1]][[1]], 'upstream') downstream <- attr(tagMatrix[[1]][[1]], 'downstream') # attr(tagMatrix, 'type') <- attr(tagMatrix[[1]][[1]], 'type') # attr(tagMatrix, 'is.binning') <- attr(tagMatrix[[1]][[1]], 'is.binning') binFlag <- attr(tagMatrix[[1]][[1]], 'is.binning') type <- attr(tagMatrix[[1]][[1]], 'type') }else{ upstream <- attr(tagMatrix[[1]], 'upstream') downstream <- attr(tagMatrix[[1]], 'downstream') binFlag <- attr(tagMatrix[[1]], 'is.binning') type <- attr(tagMatrix[[1]], 'type') } if(type == "body"){ label <- c("SS","TS") }else if(type == "start_site"){ label <- "SS" }else{ label <- "TS" } if(binFlag){ if (!(missingArg(conf) || is.na(conf))){ plotMultiProf.binning(tagMatrix = tagMatrix, xlab = xlab, ylab = ylab, conf = conf, facet = facet, free_y = free_y, upstream = upstream, downstream = downstream, label = label, ...) }else{ plotMultiProf.binning(tagMatrix = tagMatrix, xlab = xlab, ylab = ylab, facet = facet, free_y = free_y, upstream = upstream, downstream = downstream, label = label, ...) } }else{ xlim <- c(-upstream, downstream) if (!(missingArg(conf) || is.na(conf))){ plotMultiProf.normal(tagMatrix = tagMatrix, xlim = xlim, xlab = xlab, ylab = ylab, conf = conf, facet = facet, free_y = free_y, origin_label = label, ...) }else{ plotMultiProf.normal(tagMatrix = tagMatrix, xlim = xlim, xlab = xlab, ylab = ylab, facet = facet, free_y = free_y, origin_label = label, ...) } } } ##' internal function ##' ##' @param tagMatrix tagMatrix ##' @param xlim xlim ##' @param xlab xlab ##' @param ylab ylab ##' @param conf confidence interval ##' @param facet one of 'none', 'row' and 'column' ##' @param free_y if TRUE, y will be scaled by AvgProf ##' @param origin_label the label of the center ##' @param verbose print message or not ##' @param ... additional parameter plotMultiProf.normal <- function(tagMatrix, xlim, xlab="Genomic Region (5'->3')", ylab = "Peak Count Frequency", conf, facet="none", free_y = TRUE, origin_label = "TSS", verbose = TRUE, ...) { ## S4Vectors change the behavior of ifelse ## see https://support.bioconductor.org/p/70871/ ## ## conf <- ifelse(missingArg(conf), NA, conf) if (verbose) { cat(">> plotting figure...\t\t\t", format(Sys.time(), "%Y-%m-%d %X"), "\n") } conf <- if(missingArg(conf)) NA else conf if (!(missingArg(conf) || is.na(conf))){ p <- plotMultiProf.normal.internal(tagMatrix = tagMatrix, conf = conf, xlim = xlim, xlab = xlab, ylab = ylab, facet = facet, free_y = free_y, origin_label = origin_label, ...) } else { p <- plotMultiProf.normal.internal(tagMatrix, xlim = xlim, xlab = xlab, ylab = ylab, facet = facet, free_y = free_y, origin_label = origin_label, ...) } return(p) } ##' internal function ##' ##' ##' @param tagMatrix tagMatrix ##' @param xlim xlim ##' @param xlab xlab ##' @param ylab ylab ##' @param conf confidence interval ##' @param facet one of 'none', 'row' and 'column' ##' @param free_y if TRUE, y will be scaled by AvgProf ##' @param origin_label the label of the center ##' @param ... additional parameter ##' @importFrom ggplot2 ggplot ##' @importFrom ggplot2 geom_line ##' @importFrom ggplot2 geom_vline ##' @importFrom ggplot2 geom_ribbon ##' @importFrom ggplot2 scale_x_continuous ##' @importFrom ggplot2 scale_color_manual ##' @importFrom ggplot2 xlab ##' @importFrom ggplot2 ylab ##' @importFrom ggplot2 theme_bw ##' @importFrom ggplot2 theme ##' @importFrom ggplot2 element_blank ##' @importFrom ggplot2 facet_grid plotMultiProf.normal.internal <- function(tagMatrix, conf, xlim = c(-3000,3000), xlab = "Genomic Region (5'->3')", ylab = "Peak Count Frequency", facet="row", free_y = TRUE, origin_label, ...) { listFlag <- FALSE if (is.null(attr(tagMatrix[[1]],'upstream'))) { if ( is.null(names(tagMatrix)) ) { nn <- paste0("peak", seq_along(tagMatrix)) warning("input is not a named list, set the name automatically to ", paste(nn, collapse=' ')) names(tagMatrix) <- nn ## stop("tagMatrix should be a named list...") } listFlag <- TRUE } if ( listFlag ) { facet <- match.arg(facet, c("none", "row", "column")) if ( (xlim[2]-xlim[1]+1) != ncol(tagMatrix[[1]][[1]]) ) { stop("please specify appropreate xcoordinations...") } } else { if ( (xlim[2]-xlim[1]+1) != ncol(tagMatrix[[1]]) ) { stop("please specify appropreate xcoordinations...") } } ## S4Vectors change the behavior of ifelse ## see https://support.bioconductor.org/p/70871/ ## ## conf <- ifelse(missingArg(conf), NA, conf) ## conf <- if(missingArg(conf)) NA else conf pos <- value <- .id <- Lower <- Upper <- NULL if ( listFlag ) { tagCount <- lapply(as.list(names(tagMatrix)), function(x){ tmp <- tagMatrix[[x]] tagCount_tmp <- lapply(as.list(names(tmp)),function(x){ result <- getTagCount(tmp[[x]], xlim = xlim, conf = conf, ...) result$type <- x return(result) }) tagCount_tmp <- list_to_dataframe(tagCount_tmp) return(tagCount_tmp) }) names(tagCount) <- names(tagMatrix) tagCount <- list_to_dataframe(tagCount) tagCount$.id <- factor(tagCount$.id, levels=names(tagMatrix)) p <- ggplot(tagCount, aes(pos, group=type, color=type)) if (!(is.na(conf))) { p <- p + geom_ribbon(aes(ymin = Lower, ymax = Upper, fill = type), linetype = 0, alpha = 0.2) } } else { tagCount <- lapply(as.list(names(tagMatrix)), function(x){ result <- getTagCount(tagMatrix[[x]], xlim = xlim, conf = conf, ...) result$type <- x return(result) }) tagCount <- do.call("rbind",tagCount) p <- ggplot(tagCount, aes(x = pos)) if (!(is.na(conf))) { p <- p + geom_ribbon(aes(ymin = Lower, ymax = Upper,fill = type), linetype = 0, alpha = 0.2) } } p <- p + geom_line(aes(y = value,color = type)) if ( 0 > xlim[1] && 0 < xlim[2] ) { p <- p + geom_vline(xintercept=0, linetype="longdash") p <- p + scale_x_continuous(breaks=c(xlim[1], floor(xlim[1]/2), 0, floor(xlim[2]/2), xlim[2]), labels=c(paste0(xlim[1],"bp"), paste0(floor(xlim[1]/2),"bp"), origin_label, paste0(floor(xlim[2]/2),"bp"), paste0(xlim[2], "bp"))) } if (listFlag) { # cols <- getCols(length(tagMatrix[[1]])) # p <- p + scale_color_manual(values=cols) if (facet == "row") { if (free_y) { p <- p + facet_grid(.id ~ ., scales = "free_y") } else { p <- p + facet_grid(.id ~ .) } } else if (facet == "column") { if (free_y) { p <- p + facet_grid(. ~ .id, scales = "free_y") } else { p <- p + facet_grid(. ~ .id) } } } p <- p+xlab(xlab)+ylab(ylab) p <- p + theme_bw() + theme(legend.title=element_blank()) # if(facet != "none") { # p <- p + theme(legend.position="none") # } return(p) } ##' internal function ##' ##' @param tagMatrix tagMatrix ##' @param xlab xlab ##' @param ylab ylab ##' @param conf confidence interval ##' @param facet one of 'none', 'row' and 'column' ##' @param free_y if TRUE, y will be scaled by AvgProf ##' @param upstream the upstream extension ##' @param downstream the downstream extension ##' @param label the label of the center ##' @param ... additional parameter plotMultiProf.binning <- function(tagMatrix, xlab = "Genomic Region (5'->3')", ylab = "Peak Count Frequency", conf, facet ="none", free_y = TRUE, upstream = NULL, downstream = NULL, label, ...) { ## S4Vectors change the behavior of ifelse ## see https://support.bioconductor.org/p/70871/ ## ## conf <- ifelse(missingArg(conf), NA, conf) conf <- if(missingArg(conf)) NA else conf if (!(missingArg(conf) || is.na(conf))){ p <- plotMultiProf.binning.internal(tagMatrix , conf = conf, xlab = xlab, ylab = ylab, facet = facet, free_y = free_y, upstream = upstream, downstream = downstream, label = label, ...) } else { p <- plotMultiProf.binning.internal(tagMatrix , xlab = xlab, ylab = ylab, facet = facet, free_y = free_y, upstream = upstream, downstream = downstream, label = label, ...) } return(p) } ##' internal function ##' ##' @param tagMatrix tagMatrix ##' @param xlab xlab ##' @param ylab ylab ##' @param conf confidence interval ##' @param facet one of 'none', 'row' and 'column' ##' @param free_y if TRUE, y will be scaled by AvgProf ##' @param upstream the upstream extension ##' @param downstream the downstream extension ##' @param label the label of the center ##' @param ... additional parameter ##' @importFrom ggplot2 ggplot ##' @importFrom ggplot2 geom_line ##' @importFrom ggplot2 geom_vline ##' @importFrom ggplot2 geom_ribbon ##' @importFrom ggplot2 scale_x_continuous ##' @importFrom ggplot2 scale_color_manual ##' @importFrom ggplot2 xlab ##' @importFrom ggplot2 ylab ##' @importFrom ggplot2 theme_bw ##' @importFrom ggplot2 theme ##' @importFrom ggplot2 element_blank ##' @importFrom ggplot2 facet_grid ##' @importFrom ggplot2 rel plotMultiProf.binning.internal <- function(tagMatrix, conf, xlab = "Genomic Region (5'->3')", ylab = "Peak Count Frequency", facet="none", free_y = TRUE, upstream = NULL, downstream = NULL, label, ...) { listFlag <- FALSE if (is(tagMatrix[[1]][[1]],"matrix")) { if ( is.null(names(tagMatrix)) ) { nn <- paste0("peak", seq_along(tagMatrix)) warning("input is not a named list, set the name automatically to ", paste(nn, collapse=' ')) names(tagMatrix) <- nn ## stop("tagMatrix should be a named list...") } listFlag <- TRUE } if(listFlag){ nbin <- dim(tagMatrix[[1]][[1]])[2] type <- attr(tagMatrix[[1]][[1]], 'type') }else{ nbin <- dim(tagMatrix[[1]])[2] type <- attr(tagMatrix[[1]], 'type') } xlim <- c(1,nbin) if ( listFlag ) { facet <- match.arg(facet, c("none", "row", "column")) } ## S4Vectors change the behavior of ifelse ## see https://support.bioconductor.org/p/70871/ ## ## conf <- ifelse(missingArg(conf), NA, conf) ## conf <- if(missingArg(conf)) NA else conf pos <- value <- .id <- Lower <- Upper <- NULL if ( listFlag ) { tagCount <- lapply(as.list(names(tagMatrix)), function(x){ tmp <- tagMatrix[[x]] tagCount_tmp <- lapply(as.list(names(tmp)),function(x){ result <- getTagCount(tmp[[x]], xlim = xlim, conf = conf, ...) result$type <- x return(result) }) tagCount_tmp <- list_to_dataframe(tagCount_tmp) return(tagCount_tmp) }) names(tagCount) <- names(tagMatrix) tagCount <- list_to_dataframe(tagCount) tagCount$.id <- factor(tagCount$.id, levels=names(tagMatrix)) p <- ggplot(tagCount, aes(pos, group=type, color=type)) if (!(is.na(conf))) { p <- p + geom_ribbon(aes(ymin = Lower, ymax = Upper, fill = type), linetype = 0, alpha = 0.2) } } else { tagCount <- lapply(as.list(names(tagMatrix)), function(x){ result <- getTagCount(tagMatrix[[x]], xlim = xlim, conf = conf, ...) result$type <- x return(result) }) tagCount <- do.call("rbind",tagCount) p <- ggplot(tagCount, aes(pos,group=type,color=type)) if (!(is.na(conf))) { p <- p + geom_ribbon(aes(ymin = Lower, ymax = Upper,fill = type), linetype = 0, alpha = 0.2) } } p <- p + geom_line(aes(y = value,color = type)) ## x_scale for genebody if(type == 'body'){ ## x_scale for gene body with no flank extension if(is.null(upstream)){ p <- p + scale_x_continuous(breaks=c(1, floor(nbin*0.25), floor(nbin*0.5), floor(nbin*0.75), nbin), labels=c(label[1], "25%", "50%", "75%", label[2])) } ## x_scale for flank extension by relative value if(inherits(upstream, 'rel')){ p <- p + scale_x_continuous(breaks=c(1, floor(nbin*(as.numeric(upstream)*100/(100+(as.numeric(upstream)+as.numeric(downstream))*100))), floor(nbin*((as.numeric(upstream)*100+25)/(100+(as.numeric(upstream)+as.numeric(downstream))*100))), floor(nbin*((as.numeric(upstream)*100+50)/(100+(as.numeric(upstream)+as.numeric(downstream))*100))), floor(nbin*((as.numeric(upstream)*100+75)/(100+(as.numeric(upstream)+as.numeric(downstream))*100))), floor(nbin*((as.numeric(upstream)*100+100)/(100+(as.numeric(upstream)+as.numeric(downstream))*100))), nbin), labels=c(paste0("-",as.numeric(upstream)*100,"%"), label[1], "25%", "50%", "75%", label[2], paste0("+",as.numeric(downstream)*100,"%"))) p <- p + geom_vline(xintercept=floor(nbin*(as.numeric(upstream)*100/(100+(as.numeric(upstream)+as.numeric(downstream))*100))), linetype="longdash") p <- p + geom_vline(xintercept=floor(nbin*((as.numeric(upstream)*100+100)/(100+(as.numeric(upstream)+as.numeric(downstream))*100))), linetype="longdash") } ## x_scale for flank extension by absolute value if(!is.null(upstream) & !inherits(upstream, 'rel')){ upstreamPer <- floor(upstream/1000)*0.1 downstreamPer <- floor(downstream/1000)*0.1 p <- p + scale_x_continuous(breaks=c(1, floor(nbin*(upstreamPer/(1+upstreamPer+downstreamPer))), floor(nbin*((upstreamPer+0.25)/(1+upstreamPer+downstreamPer))), floor(nbin*((upstreamPer+0.5)/(1+upstreamPer+downstreamPer))), floor(nbin*((upstreamPer+0.75)/(1+upstreamPer+downstreamPer))), floor(nbin*((upstreamPer+1)/(1+upstreamPer+downstreamPer))), nbin), labels=c(paste0("-",upstream,"bp"), label[1], "25%", "50%", "75%", label[2], paste0(downstream,"bp"))) p <- p + geom_vline(xintercept=floor(nbin*(upstreamPer/(1+upstreamPer+downstreamPer))), linetype="longdash") p <- p + geom_vline(xintercept=floor(nbin*((upstreamPer+1)/(1+upstreamPer+downstreamPer))), linetype="longdash") } } ## x_scale for start region if(type != 'body'){ p <- p + scale_x_continuous(breaks=c(1, floor(nbin*0.25), floor(nbin*0.5), floor(nbin*0.75), nbin), labels=c(paste0("-",upstream,"bp"), paste0("-",floor(upstream*0.5),"bp"), label, paste0(floor(downstream*0.5),"bp"), paste0(downstream,"bp"))) p <- p + geom_vline(xintercept=floor(nbin*0.5), linetype="longdash") } if (listFlag) { if (facet == "row") { if (free_y) { p <- p + facet_grid(.id ~ ., scales = "free_y") } else { p <- p + facet_grid(.id ~ .) } } else if (facet == "column") { if (free_y) { p <- p + facet_grid(. ~ .id, scales = "free_y") } else { p <- p + facet_grid(. ~ .id) } } } p <- p+xlab(xlab)+ylab(ylab) p <- p + theme_bw() + theme(legend.title=element_blank()) # if(facet != "none") { # p <- p + theme(legend.position="none") # } return(p) } ##' plot the heatmap of tagMatrix ##' ##' ##' @title tagHeatmap ##' @param tagMatrix tagMatrix or a list of tagMatrix ##' @param xlab xlab ##' @param ylab ylab ##' @param title title ##' @param palette palette to be filled in,details see \link[ggplot2]{scale_colour_brewer} ##' @param nrow the nrow of plotting a list of peak ##' @param ncol the ncol of plotting a list of peak ##' @return figure ##' @export ##' @author G Yu tagHeatmap <- function(tagMatrix, xlab="", ylab="", title=NULL, palette="RdBu", nrow = NULL, ncol = NULL) { listFlag <- FALSE if (is(tagMatrix, "list")) { listFlag <- TRUE } peakHeatmap.internal2(tagMatrix = tagMatrix, listFlag = listFlag, palette = palette, xlab = xlab, ylab = ylab, title = title, ncol = ncol, nrow = nrow) } ##' plot the heatmap of peaks ##' ##' ##' @title peakHeatmap ##' @param peak peak file or GRanges object ##' @param weightCol column name of weight ##' @param TxDb TxDb object ##' @param upstream upstream position ##' @param downstream downstream position ##' @param xlab xlab ##' @param ylab ylab ##' @param title title ##' @param palette palette to be filled in,details see \link[ggplot2]{scale_colour_brewer} ##' @param verbose print message or not ##' @param by one of 'gene', 'transcript', 'exon', 'intron' , '3UTR' , '5UTR', 'UTR' ##' @param type one of "start_site", "end_site", "body" ##' @param nbin the amount of nbines ##' @param ignore_strand ignore the strand information or not ##' @param windows a collection of region ##' @param nrow the nrow of plotting a list of peak ##' @param ncol the ncol of plotting a list of peak ##' @return figure ##' @export ##' @author G Yu peakHeatmap <- function(peak, weightCol=NULL, TxDb=NULL, upstream=1000, downstream=1000, xlab="", ylab="", title=NULL, palette=NULL, verbose=TRUE, by="gene", type="start_site", nbin = NULL,ignore_strand = FALSE, windows,ncol = NULL, nrow = NULL) { listFlag <- FALSE if ( is(peak, "list") ) { listFlag <- TRUE if (is.null(names(peak))) stop("peak should be a peak file or a name list of peak files...") } if (verbose) { cat(">> preparing promoter regions...\t", format(Sys.time(), "%Y-%m-%d %X"), "\n") } if (verbose) { cat(">> preparing tag matrix...\t\t", format(Sys.time(), "%Y-%m-%d %X"), "\n") } if(missing(windows)){ windows <- getBioRegion(TxDb=TxDb, upstream=upstream, downstream=downstream, by=by, type=type) } if (listFlag) { tagMatrix <- lapply(peak, getTagMatrix, weightCol=weightCol, windows = windows, upstream=upstream, downstream=downstream, TxDb = TxDb, nbin = nbin, verbose = verbose, ignore_strand= ignore_strand) names(tagMatrix) <- names(peak) } else { tagMatrix <- getTagMatrix(peak, weightCol=weightCol, windows = windows, TxDb = TxDb, upstream=upstream, downstream=downstream, nbin = nbin, verbose = verbose, ignore_strand= ignore_strand) } if (verbose) { cat(">> generating figure...\t\t", format(Sys.time(), "%Y-%m-%d %X"), "\n") } xlim <- NULL p <- peakHeatmap.internal2(tagMatrix = tagMatrix, listFlag = listFlag, palette = palette, xlab = xlab, ylab = ylab, title = title, nrow = nrow, ncol = ncol) if (verbose) { cat(">> done...\t\t\t", format(Sys.time(), "%Y-%m-%d %X"), "\n") } invisible(tagMatrix) p } ##' @importFrom aplot plot_list peakHeatmap.internal2 <- function(tagMatrix, listFlag, palette, xlab, ylab, title, nrow, ncol) { if ( is.null(xlab) || is.na(xlab)) xlab <- "" if ( is.null(ylab) || is.na(ylab)) ylab <- "" if (listFlag) { nc <- length(tagMatrix) if ( is.null(palette) || is.na(palette) ) { palette <- getPalette(nc) } else if (length(palette) != nc) { palette <- rep(palette[1], nc) } else { palette <- palette } if (is.null(title) || is.na(title)) title <- names(tagMatrix) if (length(xlab) != nc) { xlab <- rep(xlab[1], nc) } if (length(ylab) != nc) { ylab <- rep(ylab[1], nc) } if (length(title) != nc) { title <- rep(title[1], nc) } tmp <- list() for (i in 1:nc) { p <- peakHeatmap.internal(tagMatrix = tagMatrix[[i]], palette = palette[i], xlab = xlab[i], ylab = ylab[i], title= title[i]) p <- p + theme(plot.title = element_text(hjust = 0.5)) tmp[[i]] <- p } if(is.null(nrow) && is.null(ncol)) nrow <- 1 p <- plot_list(gglist = tmp, ncol = ncol, nrow = nrow) return(p) } else { if (is.null(palette) || is.na(palette)) palette <- "RdBu" if (is.null(title) || is.na(title)) title <- "" peakHeatmap.internal(tagMatrix = tagMatrix, palette = palette, xlab = xlab, ylab = ylab, title = title) } } ##' @import BiocGenerics ##' @importFrom yulab.utils mat2df ##' @importFrom ggplot2 ggplot ##' @importFrom ggplot2 aes ##' @importFrom ggplot2 geom_tile ##' @importFrom ggplot2 scale_fill_distiller ##' @importFrom ggplot2 theme ##' @importFrom ggplot2 element_blank ##' @importFrom ggplot2 labs ##' @importFrom ggplot2 scale_x_continuous peakHeatmap.internal <- function(tagMatrix, palette="RdBu", xlab="", ylab="", title="") { upstream <- attr(tagMatrix, "upstream") downstream <- attr(tagMatrix, "downstream") binning_Flag <- attr(tagMatrix,"is.binning") type <- attr(tagMatrix,"type") body_Flag <- FALSE if(type == "body"){ body_Flag <- TRUE label <- attr(tagMatrix,"label") } if(binning_Flag){ nbin <- dim(tagMatrix)[2] } tagMatrix <- t(apply(tagMatrix, 1, function(x) x/max(x))) ii <- order(rowSums(tagMatrix)) tagMatrix <- tagMatrix[ii,] colnames(tagMatrix) <- seq_len(dim(tagMatrix)[2]) rownames(tagMatrix) <- seq_len(dim(tagMatrix)[1]) tagMatrix <- mat2df(tagMatrix) colnames(tagMatrix) <- c("values","sample_ID","coordinate") sample_ID <- coordinate <- NULL p <- ggplot(tagMatrix, aes(x = coordinate,y = sample_ID)) + geom_tile(aes(fill = values)) + scale_fill_distiller(palette = palette) + theme(axis.text.y=element_blank(), axis.ticks.y=element_blank(), axis.line.y = element_blank(), panel.grid=element_blank(), panel.background = element_blank()) + labs(x = xlab, y = ylab, title = title) if(body_Flag){ if(inherits(upstream, 'rel')){ p <- p + scale_x_continuous(breaks=c(1, floor(nbin*(as.numeric(upstream)*100/(100+(as.numeric(upstream)+as.numeric(downstream))*100))), floor(nbin*((as.numeric(upstream)*100+25)/(100+(as.numeric(upstream)+as.numeric(downstream))*100))), floor(nbin*((as.numeric(upstream)*100+50)/(100+(as.numeric(upstream)+as.numeric(downstream))*100))), floor(nbin*((as.numeric(upstream)*100+75)/(100+(as.numeric(upstream)+as.numeric(downstream))*100))), floor(nbin*((as.numeric(upstream)*100+100)/(100+(as.numeric(upstream)+as.numeric(downstream))*100))), nbin), labels=c(paste0("-",as.numeric(upstream)*100,"%"), label[1], "25%", "50%", "75%", label[2], paste0("+",as.numeric(downstream)*100,"%"))) } if(is.null(upstream)){ p <- p + scale_x_continuous(breaks=c(1, floor(nbin*0.25), floor(nbin*0.5), floor(nbin*0.75), nbin), labels=c(label[1], "25%", "50%", "75%", label[2])) } if(!is.null(upstream) && !inherits(upstream, 'rel')){ upstreamPer <- floor(upstream/1000)*0.1 downstreamPer <- floor(downstream/1000)*0.1 p <- p + scale_x_continuous(breaks=c(1, floor(nbin*(upstreamPer/(1+upstreamPer+downstreamPer))), floor(nbin*((upstreamPer+0.25)/(1+upstreamPer+downstreamPer))), floor(nbin*((upstreamPer+0.5)/(1+upstreamPer+downstreamPer))), floor(nbin*((upstreamPer+0.75)/(1+upstreamPer+downstreamPer))), floor(nbin*((upstreamPer+1)/(1+upstreamPer+downstreamPer))), nbin), labels=c(paste0("-",upstream,"bp"), label[1], "25%", "50%", "75%", label[2], paste0(downstream,"bp"))) } p <- p + scale_y_continuous(expand = c(0,0)) return(p) } if(binning_Flag){ p <- p + scale_x_continuous(breaks = c(1, floor(nbin*(downstream*0.5/(downstream+upstream))), floor(nbin*(downstream/(downstream+upstream))), floor(nbin*((downstream + upstream*0.5)/(downstream+upstream))), nbin), labels = c((-1*downstream), floor(-1*downstream*0.5), 0, floor(upstream*0.5), upstream)) }else{ p <- p + scale_x_continuous(labels = function(x) x - upstream) } p <- p + scale_y_continuous(expand = c(0,0)) p } ##' plot the heatmap of peaks align to a sets of regions ##' ##' ##' @title peakHeatmap ##' @param peak peak file or GRanges object ##' @param weightCol column name of weight ##' @param TxDb TxDb object ##' @param upstream upstream position ##' @param downstream downstream position ##' @param xlab xlab ##' @param ylab ylab ##' @param title title ##' @param palette palette to be filled in,details see \link[ggplot2]{scale_colour_brewer} ##' @param verbose print message or not ##' @param by one of 'gene', 'transcript', 'exon', 'intron' , '3UTR' , '5UTR', 'UTR' ##' @param type one of "start_site", "end_site", "body" ##' @param nbin the amount of nbines ##' @param ignore_strand ignore the strand information or not ##' @param windows_name the name for each window, which will also be showed in the picture as labels ##' @param nrow the nrow of plotting a list of peak ##' @param ncol the ncol of plotting a list of peak ##' @param facet_label_text_size the size of facet label text ##' @importFrom ggplot2 ggplot ##' @importFrom ggplot2 aes ##' @importFrom ggplot2 geom_tile ##' @importFrom ggplot2 scale_fill_distiller ##' @importFrom ggplot2 theme ##' @importFrom ggplot2 element_blank ##' @importFrom ggplot2 labs ##' @importFrom ggplot2 scale_x_continuous ##' @return figure ##' @export peakHeatmap_multiple_Sets <- function(peak, weightCol=NULL, TxDb=NULL, upstream=1000, downstream=1000, xlab="", ylab="", title=NULL, palette=NULL, verbose=TRUE, by="gene", type="start_site", nbin = NULL, ignore_strand = FALSE, windows_name = NULL, ncol = NULL, nrow = NULL, facet_label_text_size = 12){ listFlag <- FALSE if ( is(peak, "list") ) { listFlag <- TRUE if (is.null(names(peak))) stop("peak should be a peak file or a name list of peak files...") } if (verbose) { cat(">> preparing promoter regions...\t", format(Sys.time(), "%Y-%m-%d %X"), "\n") } ## check type if(length(type) != 1){ stop("It should be only one type...") } if(is.null(windows_name) && !is.null(names(TxDb))) windows_name <- names(TxDb) ## make the window name if (is.null(windows_name)) { nn <- by warning("set the name automatically to ", paste(nn, collapse=' ')) windows_name <- nn }else{ if (length(windows_name) != length(by)) { stop("the length of the window name and the by should be equal...") } } if ( is(peak, "list") ) { tagMatrix <- lapply(peak, getTagMatrix2, upstream=upstream, downstream=downstream, windows_name=windows_name, type=type, by=by, TxDb=TxDb, weightCol = weightCol, nbin = nbin, verbose = verbose, ignore_strand= ignore_strand) } else { tagMatrix <- getTagMatrix2(peak=peak, upstream=upstream, downstream=downstream, windows_name=windows_name, type=type, by=by, TxDb=TxDb, weightCol = weightCol, nbin = nbin, verbose = verbose, ignore_strand= ignore_strand) } if(listFlag){ nc <- length(tagMatrix) if ( is.null(palette) || is.na(palette) ) { palette <- getPalette(nc) } else if (length(palette) != nc) { palette <- rep(palette[1], nc) } else { palette <- palette } if (is.null(title) || is.na(title)) title <- names(tagMatrix) if (length(xlab) != nc) { xlab <- rep(xlab[1], nc) } if (length(ylab) != nc) { ylab <- rep(ylab[1], nc) } if (length(title) != nc) { title <- rep(title[1], nc) } tmp <- list() for (i in 1:nc) { p <- peakHeatmap_multiple_Sets.internal(tagMatrix = tagMatrix[[i]], upstream=upstream, downstream=downstream, xlab=xlab[[i]], ylab=ylab[[i]], title=title[[i]], palette=palette[[i]], ncol = ncol, nrow = nrow, facet_label_text_size = facet_label_text_size) p <- p + theme(plot.title = element_text(hjust = 0.5)) tmp[[i]] <- p } if(is.null(nrow) && is.null(ncol)) nrow <- 1 p <- plot_list(gglist = tmp, ncol = ncol, nrow = nrow) }else{ if (is.null(palette) || is.na(palette)) palette <- "RdBu" if (is.null(title) || is.na(title)) title <- "" p <- peakHeatmap_multiple_Sets.internal(tagMatrix = tagMatrix, upstream=upstream, downstream=downstream, xlab=xlab, ylab=ylab, title=title, palette=palette, ncol = ncol, nrow = nrow, facet_label_text_size = facet_label_text_size) } return(p) } ##' @importFrom yulab.utils mat2df ##' @importFrom ggplot2 ggplot ##' @importFrom ggplot2 aes ##' @importFrom ggplot2 geom_tile ##' @importFrom ggplot2 scale_fill_distiller ##' @importFrom ggplot2 theme ##' @importFrom ggplot2 element_blank ##' @importFrom ggplot2 labs ##' @importFrom ggplot2 scale_x_continuous ##' @importFrom ggplot2 facet_grid ##' @importFrom ggplot2 element_text ##' @importFrom ggplot2 element_blank peakHeatmap_multiple_Sets.internal <- function(tagMatrix, upstream=1000, downstream=1000, xlab="", ylab="", title=NULL, palette=NULL, ncol = NULL, nrow = NULL, facet_label_text_size = 12){ binning_Flag <- attr(tagMatrix[[1]],"is.binning") if(binning_Flag) nbin <- dim(tagMatrix[[1]])[2] type <- attr(tagMatrix,"type") body_Flag <- FALSE if(attr(tagMatrix[[1]],"type") == "body"){ body_Flag <- TRUE label <- attr(tagMatrix,"label") } name_of_list <- as.list(names(tagMatrix)) peak_list <- lapply(name_of_list,function(x){ tagMatrix[[x]] <- t(apply(tagMatrix[[x]], 1, function(x) x/max(x))) ii <- order(rowSums(tagMatrix[[x]])) tagMatrix[[x]] <- tagMatrix[[x]][ii,] colnames(tagMatrix[[x]]) <- seq_len(dim(tagMatrix[[x]])[2]) rownames(tagMatrix[[x]]) <- seq_len(dim(tagMatrix[[x]])[1]) tagMatrix[[x]] <- mat2df(tagMatrix[[x]]) colnames(tagMatrix[[x]]) <- c("values","sample_ID","coordinate") tagMatrix[[x]]$sample <- x return(tagMatrix[[x]]) }) peak_df <- list_to_dataframe(peak_list) sample_ID <- coordinate <- NULL p <- ggplot(peak_df, aes(x = coordinate,y = sample_ID)) + geom_tile(aes(fill = values)) + scale_fill_distiller(palette = palette) + theme(axis.text.y=element_blank(), axis.ticks.y=element_blank(), axis.line.y = element_blank(), panel.grid=element_blank(), panel.background = element_blank()) + labs(x = xlab, y = ylab, title = title) if(body_Flag){ if(inherits(upstream, 'rel')){ p <- p + scale_x_continuous(breaks=c(1, floor(nbin*(as.numeric(upstream)*100/(100+(as.numeric(upstream)+as.numeric(downstream))*100))), floor(nbin*((as.numeric(upstream)*100+25)/(100+(as.numeric(upstream)+as.numeric(downstream))*100))), floor(nbin*((as.numeric(upstream)*100+50)/(100+(as.numeric(upstream)+as.numeric(downstream))*100))), floor(nbin*((as.numeric(upstream)*100+75)/(100+(as.numeric(upstream)+as.numeric(downstream))*100))), floor(nbin*((as.numeric(upstream)*100+100)/(100+(as.numeric(upstream)+as.numeric(downstream))*100))), nbin), labels=c(paste0("-",as.numeric(upstream)*100,"%"), label[1], "25%", "50%", "75%", label[2], paste0("+",as.numeric(downstream)*100,"%"))) } if(is.null(upstream)){ p <- p + scale_x_continuous(breaks=c(1, floor(nbin*0.25), floor(nbin*0.5), floor(nbin*0.75), nbin), labels=c(label[1], "25%", "50%", "75%", label[2])) } if(!is.null(upstream) && !inherits(upstream, 'rel')){ upstreamPer <- floor(upstream/1000)*0.1 downstreamPer <- floor(downstream/1000)*0.1 p <- p + scale_x_continuous(breaks=c(1, floor(nbin*(upstreamPer/(1+upstreamPer+downstreamPer))), floor(nbin*((upstreamPer+0.25)/(1+upstreamPer+downstreamPer))), floor(nbin*((upstreamPer+0.5)/(1+upstreamPer+downstreamPer))), floor(nbin*((upstreamPer+0.75)/(1+upstreamPer+downstreamPer))), floor(nbin*((upstreamPer+1)/(1+upstreamPer+downstreamPer))), nbin), labels=c(paste0("-",upstream,"bp"), label[1], "25%", "50%", "75%", label[2], paste0(downstream,"bp"))) } p <- p + facet_grid(sample ~ .,switch = "y",space = "free_y",scales = "free_y") + theme(strip.text.y.left = element_text(color = "black",face = "bold", size = facet_label_text_size), strip.background = element_blank()) return(p) } if(binning_Flag){ p <- p + scale_x_continuous(breaks = c(1, floor(nbin*(downstream*0.5/(downstream+upstream))), floor(nbin*(downstream/(downstream+upstream))), floor(nbin*((downstream + upstream*0.5)/(downstream+upstream))), nbin), labels = c((-1*downstream), floor(-1*downstream*0.5), 0, floor(upstream*0.5), upstream)) }else{ p <- p + scale_x_continuous(breaks = c(1, floor(downstream*0.5), (downstream + 1), (downstream + 1 + floor(upstream * 0.5)), upstream+downstream+1), labels = c((-1*downstream), floor(-1*downstream*0.5), 0, floor(upstream*0.5), upstream)) } p <- p + facet_grid(sample ~ .,switch = "y",scales = "free_y",space = "free") + theme(strip.text.y.left = element_text(color = "black",face = "bold", size = facet_label_text_size), strip.background = element_blank()) + scale_y_continuous(expand = c(0,0)) return(p) } ##' plot peak heatmap and profile in a picture ##' ##' ##' @title peak_Profile_Heatmap ##' @param peak peak file or GRanges object ##' @param weightCol column name of weight ##' @param TxDb TxDb object ##' @param upstream upstream position ##' @param downstream downstream position ##' @param xlab xlab ##' @param ylab ylab ##' @param title title ##' @param palette palette to be filled in,details see \link[ggplot2]{scale_colour_brewer} ##' @param verbose print message or not ##' @param by one of 'gene', 'transcript', 'exon', 'intron' , '3UTR' , '5UTR', 'UTR' ##' @param type one of "start_site", "end_site", "body" ##' @param nbin the amount of nbines ##' @param ignore_strand ignore the strand information or not ##' @param windows_name the name for each window, which will also be showed in the picture as labels ##' @param nrow the nrow of plotting a list of peak ##' @param ncol the ncol of plotting a list of peak ##' @param facet_label_text_size the size of facet label text ##' @param conf confidence interval ##' @param facet one of 'none', 'row' and 'column' ##' @param free_y if TRUE, y will be scaled by AvgProf ##' @param height_proportion the proportion of profiling picture and heatmap ##' @importFrom aplot insert_bottom ##' @importFrom aplot plot_list ##' @export peak_Profile_Heatmap <- function(peak, weightCol=NULL, TxDb=NULL, upstream=1000, downstream=1000, xlab="", ylab="", title=NULL, palette=NULL, verbose=TRUE, by="gene", type="start_site", nbin = NULL, ignore_strand = FALSE, windows_name = NULL, ncol = NULL, nrow = NULL, facet_label_text_size = 12, conf, facet = "row", free_y = TRUE, height_proportion = 4){ conf <- if(missingArg(conf)) NA else conf if(is(peak, "list")){ nc <- length(peak) tmp <- list() if ( is.null(names(peak)) ) { nn <- paste0("peak", seq_along(peak)) warning("input is not a named list, set the name automatically to ", paste(nn, collapse=' ')) names(peak) <- nn ## stop("tagMatrix should be a named list...") } if(is.null(palette)) palette <- getPalette(nc) if(is.null(title)) title_of_plot <- names(peak) for (i in 1:nc) { peak_profile <- plotPeakProf(peak = peak[[i]], upstream = upstream, downstream = downstream, conf = conf, by = by, type = type, windows_name = windows_name, weightCol = weightCol, TxDb = TxDb, xlab = xlab, ylab = ylab, facet = facet, free_y = free_y, verbose = verbose, nbin = nbin, ignore_strand = ignore_strand) peak_profile <- peak_profile + labs(title = title_of_plot[i]) + theme(plot.title = element_text(hjust = 0.5)) if(length(by) != 1){ peak_heatmap <- peakHeatmap_multiple_Sets(peak = peak[[i]], weightCol=weightCol, TxDb=TxDb, upstream=upstream, downstream=downstream, xlab=xlab, ylab=ylab, title=title, palette=palette[[i]], verbose=verbose, by=by, type=type, nbin = nbin, ignore_strand = ignore_strand, windows_name = windows_name, ncol = ncol, nrow = nrow, facet_label_text_size = facet_label_text_size) }else{ peak_heatmap <- peakHeatmap(peak[[i]], weightCol=weightCol, TxDb=TxDb, upstream=upstream, downstream=downstream, xlab=xlab, ylab=ylab, title=title, palette=palette[[i]], verbose=verbose, by=by, type=type, nbin = nbin, ignore_strand = ignore_strand, ncol = ncol, nrow = nrow) } p <- peak_profile %>% insert_bottom(peak_heatmap,height = height_proportion) tmp[[i]] <- p } if (is.null(ncol) && is.null(nrow)) nrow <- 1 p <- plot_list(gglist = tmp, ncol = ncol, nrow = nrow) return(p) } peak_profile <- plotPeakProf(peak = peak, upstream = upstream, downstream = downstream, conf = conf, by = by, type = type, windows_name = windows_name, weightCol = weightCol, TxDb = TxDb, xlab = xlab, ylab = ylab, facet = facet, free_y = free_y, verbose = verbose, nbin = nbin, ignore_strand = ignore_strand) if(length(by) != 1){ peak_heatmap <- peakHeatmap_multiple_Sets(peak = peak, weightCol=weightCol, TxDb=TxDb, upstream=upstream, downstream=downstream, xlab=xlab, ylab=ylab, title=title, palette=palette, verbose=verbose, by=by, type=type, nbin = nbin, ignore_strand = ignore_strand, windows_name = windows_name, ncol = ncol, nrow = nrow, facet_label_text_size = facet_label_text_size) }else{ peak_heatmap <- peakHeatmap(peak = peak, weightCol=weightCol, TxDb=TxDb, upstream=upstream, downstream=downstream, xlab=xlab, ylab=ylab, title=title, palette=palette, verbose=verbose, by=by, type=type, nbin = nbin, ignore_strand = ignore_strand, ncol = ncol, nrow = nrow) } p <- peak_profile %>% insert_bottom(peak_heatmap,height = height_proportion) return(p) } ================================================ FILE: R/readPeakFile.R ================================================ ##' read peak file and store in data.frame or GRanges object ##' ##' ##' @title readPeakFile ##' @param peakfile peak file ##' @param as output format, one of GRanges or data.frame ##' @param ... additional parameter (pass to `utils::read.delim()`) ##' @return peak information, in GRanges or data.frame object ##' @import IRanges GenomicRanges ##' @export ##' @examples ##' peakfile <- system.file("extdata", "sample_peaks.txt", package="ChIPseeker") ##' peak.gr <- readPeakFile(peakfile, as="GRanges") ##' peak.gr ##' @author G Yu readPeakFile <- function(peakfile, as="GRanges", ...) { as <- match.arg(as, c("GRanges", "data.frame")) peak.df <- peak2DF(peakfile, ...) if (as == "data.frame") return(peak.df) peak.gr <- peakDF2GRanges(peak.df) return(peak.gr) } peakDF2GRanges <- function(peak.df) { peak.gr=GRanges(seqnames=peak.df[,1], ranges=IRanges(peak.df[,2], peak.df[,3])) cn <- colnames(peak.df) if (length(cn) > 3) { for (i in 4:length(cn)) { mcols(peak.gr)[[cn[i]]] <- peak.df[, cn[i]] } } return(peak.gr) } ##' @importFrom utils read.delim peak2DF <- function(peakfile, header, ...) { if (missing(header)) { ## determine file format if (isBedFile(peakfile)) { header <- FALSE } else { header <- TRUE } } peak.df <- read.delim(peakfile, header=header, comment.char="#", ...) ## coordinate system in BED file is start at 0 ## refer to http://asia.ensembl.org/info/website/upload/bed.html?redirect=no ## The chromEnd base is not included in the display of the feature. ## For example, the first 100 bases of a chromosome are defined as chromStart=0, chromEnd=100, ## and span the bases numbered 0-99. ## so chromEnd, peak.df[,3], is not needed to +1 peak.df[,2] <- peak.df[,2] + 1 return(peak.df) } isBedFile <- function(peakfile) { ## peakfile is a peak file name grepl("\\.bed$", peakfile) || grepl("\\.bed.gz$", peakfile) || grepl("\\Peak.gz$", peakfile) || grepl("\\.bedGraph.gz$", peakfile) || grepl("\\.narrowPeak$", peakfile) || grepl("\\.broadPeak$",peakfile) || grepl("\\.gappedPeak$", peakfile) } ================================================ FILE: R/seq2gene.R ================================================ ##' annotate genomic regions to genes in many-to-many mapping ##' ##' This funciton associates genomic regions with coding genes in a many-to-many mapping. It first maps genomic regions to host genes (either located in exon or intron), proximal genes (located in promoter regions) and flanking genes (located in upstream and downstream within user specify distance). ##' @title seq2gene ##' @param seq genomic regions in GRanges object ##' @param tssRegion TSS region ##' @param flankDistance flanking search radius ##' @param TxDb TranscriptDb object ##' @param sameStrand logical whether find nearest/overlap gene in the same strand ##' @return gene vector ##' @export ##' @examples ##' \dontrun{ ##' library(TxDb.Hsapiens.UCSC.hg19.knownGene) ##' TxDb <- TxDb.Hsapiens.UCSC.hg19.knownGene ##' file <- getSampleFiles()[[1]] # a bed file ##' gr <- readPeakFile(file) ##' genes <- seq2gene(gr, tssRegion=c(-1000, 1000), flankDistance = 3000, TxDb) ##' } ##' @importFrom yulab.utils get_cache_element ##' @importFrom yulab.utils update_cache_item ##' @author Guangchuang Yu seq2gene <- function(seq, tssRegion, flankDistance, TxDb, sameStrand=FALSE) { .ChIPseekerEnv(TxDb, item = ChIPseekerCache) # ChIPseekerEnv <- get("ChIPseekerEnv", envir=.GlobalEnv) ## Exons exonList <- get_cache_element(item = ChIPseekerCache, elements = "exonList") if(is.null(exonList)){ exonList <- exonsBy(TxDb) update_cache_item(item = ChIPseekerCache, list("exonList" = exonList)) } # if ( exists("exonList", envir=ChIPseekerEnv, inherits=FALSE) ) { # exonList <- get("exonList", envir=ChIPseekerEnv) # } else { # exonList <- exonsBy(TxDb) # assign("exonList", exonList, envir=ChIPseekerEnv) # } exons <- getGenomicAnnotation.internal(seq, exonList, type = "Exon", sameStrand=sameStrand) ## Introns intronList <- get_cache_element(item = ChIPseekerCache, elements = "intronList") if(is.null(intronList)){ intronList <- intronsByTranscript(TxDb) update_cache_item(item = ChIPseekerCache, list("intronList" = intronList)) } # if ( exists("intronList", envir=ChIPseekerEnv, inherits=FALSE) ) { # intronList <- get("intronList", envir=ChIPseekerEnv) # } else { # intronList <- intronsByTranscript(TxDb) # assign("intronList", intronList, envir=ChIPseekerEnv) # } introns <- getGenomicAnnotation.internal(seq, intronList, type="Intron", sameStrand=sameStrand) genes <- c(exons$gene, introns$gene) ## > head(genes) ## [1] "uc001aed.3/126789" "uc001aka.3/440556" "uc001ako.3/49856" ## [4] "uc001alg.3/100133612" "uc009vly.2/390992" "uc001awv.2/79814" genes <- gsub("\\w+\\.*\\d*/(\\d+)", "\\1", genes) ## > head(genes) ## [1] "126789" "440556" "49856" "100133612" "390992" "79814" features <- getGene(TxDb, by="gene") idx.dist <- getNearestFeatureIndicesAndDistances(seq, features, sameStrand=sameStrand) nearestFeatures <- features[idx.dist$index] distance <- idx.dist$distance pi <- distance > tssRegion[1] & distance < tssRegion[2] promoters <- mcols(nearestFeatures[pi])[["gene_id"]] nearest_genes <- mcols(nearestFeatures[!pi][abs(distance[!pi]) < flankDistance])[["gene_id"]] genes <- c(genes, promoters, nearest_genes) return(unique(genes)) } ================================================ FILE: R/subset.R ================================================ ##' @importFrom S4Vectors subset ##' @importFrom BiocGenerics start ##' @importFrom BiocGenerics end ##' @method subset csAnno ##' @export subset.csAnno <- function(x, ... ){ index <- paste(seqnames(x@anno),start(x@anno),end(x@anno), sep = "_") # subset the GRanges x@anno <- subset(x@anno, ...) index2 <- paste(seqnames(x@anno),start(x@anno),end(x@anno), sep = "_") # the tssRgion, level, hsaGenomicAnnotation keep unchanged # change the detailGenomicAnnotation x@detailGenomicAnnotation <- x@detailGenomicAnnotation[index %in% index2,] # change the annotation stat x@annoStat <- getGenomicAnnoStat(x@anno) # change peak number x@peakNum <- length(x@anno) return(x) } ================================================ FILE: R/tagMatrix.R ================================================ ##' prepare the promoter regions ##' ##' ##' @title getPromoters ##' @param TxDb TxDb ##' @param upstream upstream from TSS site ##' @param downstream downstream from TSS site ##' @param by one of gene or transcript ##' @return GRanges object ##' @export getPromoters <- function(TxDb=NULL, upstream=1000, downstream=1000, by = "gene") { getBioRegion(TxDb = TxDb, upstream = upstream, downstream = downstream, by = by, type = "start_site") } ##' prepare a bioregion of selected feature ##' ##' this function combined previous functions getPromoters(), getBioRegion() and getGeneBody() in order ##' to solve the following issues. ##' ##' (1) \url{https://github.com/GuangchuangYu/ChIPseeker/issues/16} ##' ##' (2) \url{https://github.com/GuangchuangYu/ChIPseeker/issues/87} ##' ##' The getBioRegion() function can prevoid a region of interest from ##' \code{txdb} object. There are three kinds of regions, \code{start_site}, ##' \code{end_site} and \code{body}. ##' ##' We take transcript region to expain the differences of these three regions. ##' tx: chr1 1000 1400. ##' ##' \code{body} region refers to the 1000-1400bp. ##' ##' \code{start_site} region with \code{upstream = 100, downstream = 100} refers to 900-1100bp. ##' ##' \code{end_site} region with \code{upstream = 100, downstream = 100} refers to 1300-1500bp. ##' ##' @title getBioRegion ##' @param TxDb TxDb ##' @param upstream upstream from start site or end site ##' @param downstream downstream from start site or end site ##' @param by one of 'gene', 'transcript', 'exon', 'intron' , '3UTR' , '5UTR', 'UTR' ##' @param type one of "start_site", "end_site", "body" ##' @return GRanges object ##' @import BiocGenerics IRanges GenomicRanges ##' @importFrom yulab.utils get_cache_item ##' @author Guangchuang Yu, Ming L ##' @export getBioRegion <- function(TxDb=NULL, upstream=1000, downstream=1000, by="gene", type="start_site"){ by <- match.arg(by, c('gene', 'transcript', 'exon', 'intron' , '3UTR' , '5UTR','UTR')) type <- match.arg(type, c("start_site", "end_site", "body")) TxDb <- loadTxDb(TxDb) .ChIPseekerEnv(TxDb, item = ChIPseekerCache) # ChIPseekerEnv <- get("ChIPseekerEnv", envir=.GlobalEnv) label <- make_label(type = type, by = by) if(by == 'gene' || by == 'transcript'){ regions <- getGene(TxDb, by) } if (by == "exon") { # exonList <- get_exonList(ChIPseekerEnv) exonList <- get_exonList(item = ChIPseekerCache) regions <- unlist(exonList) } if (by == "intron") { # intronList <- get_intronList(ChIPseekerEnv) intronList <- get_intronList(item = ChIPseekerCache) regions <- unlist(intronList) } if (by == "3UTR") { threeUTRList <- threeUTRsByTranscript(TxDb) regions <- unlist(threeUTRList) } if (by == "5UTR") { fiveUTRList <- fiveUTRsByTranscript(TxDb) regions <- unlist(fiveUTRList) } if (by == 'UTR'){ three_URT <- threeUTRsByTranscript(TxDb) three_UTR_regions <- unlist(three_URT) five_UTR <- fiveUTRsByTranscript(TxDb) five_UTR_regions <- unlist(five_UTR) regions <- c(three_UTR_regions,five_UTR_regions) } if(type == "start_site"){ coordinate<- ifelse(strand(regions) == "+", start(regions), end(regions)) }else if(type == "end_site"){ coordinate<- ifelse(strand(regions) == "+", end(regions), start(regions)) }else{ ## assign attribute attr(regions, 'type') = type attr(regions, 'by') = by attr(regions, 'label') = label return(regions) } ## issue and code obtained from Chen Ting(NIH/NCI) start_site <- ifelse(strand(regions) == "+",coordinate-upstream, coordinate-downstream) end_site <- ifelse(strand(regions) == "+", coordinate+downstream, coordinate+upstream) bioRegion <- GRanges(seqnames=seqnames(regions), ranges=IRanges(start_site, end_site), strand=strand(regions)) bioRegion <- unique(bioRegion) ## assign attribute attr(bioRegion, 'type') = type attr(bioRegion, 'by') = by ## different region have different label to be added to the figures ## so we attach label to the Granges object attr(bioRegion, 'label') = label attr(bioRegion, 'upstream') = upstream attr(bioRegion, 'downstream') = downstream return(bioRegion) } ##' make windows from granges object ##' ##' \code{makeBioRegionFromGranges()} function can make bioregion from granges object. ##' ##' The differences between \code{makeBioRegionFromGranges()} and \code{getBioRegion()} is that ##' \code{getBioRegion()} get the region object from \code{txdb} object but ##' \code{makeBioRegionFromGranges()} get the region from the granges object provided by users. ##' For example, \code{txdb} object do not contain insulator or enhancer regions. Users can ##' provide these regions through self-made granges object ##' ##' There are three kinds of regions, \code{start_site}, \code{end_site} and \code{body}. ##' ##' We take enhancer region to explain the differences of these three regions. ##' enhancer: chr1 1000 1400. ##' ##' \code{body} region refers to the 1000-1400bp. ##' ##' \code{start_site} region with \code{upstream = 100, downstream = 100} refers to 900-1100bp. ##' ##' \code{end_site} region with \code{upstream = 100, downstream = 100} refers to 1300-1500bp. ##' ##' In \code{makeBioRegionFromGranges()}, \code{upstream} and \code{downstream} can be ##' \code{NULL} if the \code{type == 'body'}. \code{by} should be specified by users and ##' can not be omitted. \code{by} parameter will be used to made labels. \code{type} should also ##' be specified. ##' ##' \url{https://github.com/YuLab-SMU/ChIPseeker/issues/189} ##' ##' @title makeBioRegionFromGranges ##' ##' @param gr a grange object contain region of interest ##' @param upstream upstream from start site or end site, can be NULL if the type == 'body' ##' @param downstream downstream from start site or end site, can be NULL if the type == 'body' ##' @param by specify be users, e.g. gene, insulator, enhancer ##' @param type one of "start_site", "end_site", "body" ##' @return GRanges object ##' @import BiocGenerics IRanges GenomicRanges ##' @export makeBioRegionFromGranges <- function(gr, by, type, upstream=1000, downstream=1000){ if (!is(gr, "GRanges")) { stop("windows should be a GRanges object...") } type <- match.arg(type, c("start_site", "end_site", "body")) label <- make_label(type = type, by = by) regions <- gr if(type == "start_site"){ coordinate<- ifelse(strand(regions) == "+", start(regions), end(regions)) }else if(type == "end_site"){ coordinate<- ifelse(strand(regions) == "+", end(regions), start(regions)) }else{ ## assign attribute attr(regions, 'type') = type attr(regions, 'by') = by attr(regions, 'label') = label return(regions) } ## issue and code obtained from Chen Ting(NIH/NCI) start_site <- ifelse(strand(regions) == "+",coordinate-upstream, coordinate-downstream) end_site <- ifelse(strand(regions) == "+", coordinate+downstream, coordinate+upstream) bioRegion <- GRanges(seqnames=seqnames(regions), ranges=IRanges(start_site, end_site), strand=strand(regions)) bioRegion <- unique(bioRegion) ## assign attribute attr(bioRegion, 'type') = type attr(bioRegion, 'by') = by attr(bioRegion, 'label') = label attr(bioRegion, 'upstream') = upstream attr(bioRegion, 'downstream') = downstream return(bioRegion) } ##' calculate the tag matrix ##' ##' \code{getTagMatrix()} function can produce the matrix for visualization. ##' \code{peak} stands for the peak file. ##' \code{window} stands for a collection of regions that users want to look into. ##' Users can use \code{window} to capture the peak of interest. ##' There are two ways to input \code{window}. ##' ##' The first way is that users can use ##' \code{getPromoters()/getBioRegion()/makeBioRegionFromGranges()} to ##' get \code{window} and put it into \code{getTagMatrix()}. ##' ##' The second way is that users can use \code{getTagMatrix()} to ##' call \code{getPromoters()/getBioRegion()/makeBioRegionFromGranges()}. In this way ##' users do not need to input \code{window} parameter but they need to input ##' \code{txdb}. ##' ##' \code{txdb} is a set of packages contained annotation ##' of regions of different genomes. Users can ##' get the regions of interest through specific functions. These specific functions ##' are built in \code{getPromoters()/getBioRegion()}. Many regions can not be gain ##' through \code{txdb}, like insulator and enhancer regions. ##' Users can provide these regions in the form of granges object. ##' These self-made granges object will be passed to \code{TxDb} parameter and they will ##' be passed to \code{makeBioRegionFromGranges()} to produce the \code{window}. ##' In a word, \code{TxDb} parameter is a reference information. Users can ##' pass \code{txdb object} or self-made granges into it. ##' ##' Details see \code{\link{getPromoters}},\code{\link{getBioRegion}} and \code{\link{makeBioRegionFromGranges}} ##' ##' \code{upstream} and \code{downstream} parameter have different usages: ##' ##' (1) \code{window} parameter is provided, ##' ##' if \code{type == 'body'}, \code{upstream} and \code{downstream} can use to extend ##' the flank of body region. ##' ##' if \code{type == 'start_site'/'end_site'}, \code{upstream} and \code{downstream} do not ##' play a role in \code{getTagMatrix()} function. ##' ##' (2) \code{window} parameter is missing, ##' ##' if \code{type == 'body'}, \code{upstream} and \code{downstream} can use to extend ##' the flank of body region. ##' ##' if \code{type == 'start_site'/'end_site'}, \code{upstream} and \code{downstream} refer to ##' the upstream and downstream of the start_site or the end_site. ##' ##' \code{weightCol} refers to column in peak file. This column acts as a weight vaule. Details ##' see \url{https://github.com/YuLab-SMU/ChIPseeker/issues/15} ##' ##' \code{nbin} refers to the number of bins. \code{getTagMatrix()} provide a binning method ##' to get the tag matrix. ##' ##' @title getTagMatrix ##' ##' @param peak peak peak file or GRanges object ##' @param upstream the distance of upstream extension ##' @param downstream the distance of downstream extension ##' @param windows a collection of region ##' @param type one of "start_site", "end_site", "body" ##' @param by one of 'gene', 'transcript', 'exon', 'intron', '3UTR' , '5UTR', or specified by users ##' @param TxDb TxDb or self-made granges object, served as txdb ##' @param weightCol column name of weight, default is NULL ##' @param nbin the amount of nbines ##' @param verbose print message or not ##' @param ignore_strand ignore the strand information or not ##' @return tagMatrix ##' @importFrom ggplot2 rel ##' @export getTagMatrix <- function(peak, upstream, downstream, windows, type, by, TxDb=NULL, weightCol = NULL, nbin = NULL, verbose = TRUE, ignore_strand= FALSE){ is_GRanges_of_TxDb <- FALSE if (is(TxDb, "GRanges")) { is_GRanges_of_TxDb <- TRUE message("#\n#.. 'TxDb' is a self-defined 'GRanges' object...\n#") } if(missingArg(windows)){ if(is_GRanges_of_TxDb){ ## make windows from self-made granges object windows <- makeBioRegionFromGranges(gr=TxDb, by=by, type=type, upstream=upstream, downstream=downstream) }else{ ## make windows from txdb object windows <- getBioRegion(TxDb=TxDb, upstream=upstream, downstream=downstream, by=by, type=type) } }else{ if (!is(windows, "GRanges")) { stop("windows should be a GRanges object...") } if(is.null(attr(windows,'type'))){ stop("windows should be made from getPromoters()/getBioRegion()/makeBioRegionFromGranges()") } type <- attr(windows, 'type') by <- attr(windows, 'by') } # check the upstream and downstream parameter if(type == "body"){ if(missingArg(upstream)){ upstream <- NULL } if(missingArg(downstream)){ downstream <- NULL } }else{ upstream <- attr(windows, 'upstream') downstream <- attr(windows, 'downstream') } ## check upstream and downstream parameter check_upstream_and_downstream(upstream = upstream, downstream = downstream) if(type != 'body'){ if(inherits(upstream, 'rel') || is.null(upstream)){ stop("upstream and downstream for site region should be actual number...") } } ## check nbin parameters if(!is.null(nbin) && !is.numeric(nbin)){ stop('nbin should be NULL or numeric...') } if(type == 'body' && is.null(nbin)){ stop('plotting body region should set the nbin parameter...') } ## check nbin parameter if(!is.null(nbin)){ cat(">> binning method is used...", format(Sys.time(), "%Y-%m-%d %X"), "\n",sep = "") is.binning <- TRUE }else{ is.binning <- FALSE } if (verbose) { cat(">> preparing ",type," regions"," by ",by,"... ", format(Sys.time(), "%Y-%m-%d %X"), "\n",sep = "") } if(is.binning){ if (verbose) { cat(">> preparing tag matrix by binning... ", format(Sys.time(), "%Y-%m-%d %X"), "\n") } tagMatrix <- getTagMatrix.binning.internal(peak = peak, weightCol = weightCol, windows = windows, nbin = nbin, upstream = upstream, downstream = downstream, ignore_strand = ignore_strand) }else{ if (verbose) { cat(">> preparing tag matrix... ", format(Sys.time(), "%Y-%m-%d %X"), "\n") } tagMatrix <- getTagMatrix.internal(peak=peak, weightCol=weightCol, windows=windows, ignore_strand=ignore_strand) } ## assign attribute attr(tagMatrix, 'upstream') = upstream attr(tagMatrix, 'downstream') = downstream attr(tagMatrix, 'type') = attr(windows, 'type') attr(tagMatrix, 'label') = attr(windows, 'label') attr(tagMatrix, "is.binning") <- is.binning return(tagMatrix) } ##' calculate the tag matrix ##' ##' ##' @title getTagMatrix.internal ##' @param peak peak file or GRanges object ##' @param weightCol column name of weight, default is NULL ##' @param windows a collection of region with equal size, eg. promoter region. ##' @param ignore_strand ignore the strand information or not ##' @return tagMatrix ##' @import BiocGenerics S4Vectors IRanges GenomeInfoDb GenomicRanges ##' @author G Yu getTagMatrix.internal <- function(peak, weightCol=NULL, windows, ignore_strand= FALSE) { peak.gr <- loadPeak(peak) if (! is(windows, "GRanges")) { stop("windows should be a GRanges object...") } if (length(unique(width(windows))) != 1) { stop("width of windows should be equal...") } ## if (!exists("ChIPseekerEnv", envir = .GlobalEnv)) { ## assign("ChIPseekerEnv", new.env(), .GlobalEnv) ## } ## ChIPseekerEnv <- get("ChIPseekerEnv", envir = .GlobalEnv) ## if (exists("peak", envir=ChIPseekerEnv, inherits=FALSE) && ## exists("promoters", envir=ChIPseekerEnv, inherits=FALSE) && ## exists("weightCol", envir=ChIPseekerEnv, inherits=FALSE) && ## exists("tagMatrix", envir=ChIPseekerEnv, inherits=FALSE) ) { ## pp <- get("peak", envir=ChIPseekerEnv) ## promoters <- get("promoters", envir=ChIPseekerEnv) ## w <- get("weightCol", envir=ChIPseekerEnv) ## if (all(pp == peak)) { ## if (all(windows == promoters)) { ## if ( (is.null(w) && is.null(weightCol)) || ## (!is.null(w) && !is.null(weightCol) && w == weightCol)) { ## tagMatrix <- get("tagMatrix", envir=ChIPseekerEnv) ## return(tagMatrix) ## } else { ## assign("weightCol", weightCol, envir=ChIPseekerEnv) ## } ## } else { ## assign("promoters", windows) ## ## make sure it is not conflict with getPromoters ## if ( exists("upstream", envir=ChIPseekerEnv, inherits=FALSE)) ## rm("upstream", envir=ChIPseekerEnv) ## } ## } else { ## assign("peak", peak, envir=ChIPseekerEnv) ## } ## } ## if ( !exists("peak", envir=ChIPseekerEnv, inherits=FALSE)) { ## assign("peak", peak, envir=ChIPseekerEnv) ## } ## if ( !exists("promoters", envir=ChIPseekerEnv, inherits=FALSE)) { ## assign("promoters", windows, envir=ChIPseekerEnv) ## } ## if (!exists("weightCol", envir=ChIPseekerEnv, inherits=FALSE)) { ## assign("weightCol", weightCol, envir=ChIPseekerEnv) ## } if (is.null(weightCol)) { peak.cov <- coverage(peak.gr) } else { weight <- mcols(peak.gr)[[weightCol]] peak.cov <- coverage(peak.gr, weight=weight) } cov.len <- elementNROWS(peak.cov) cov.width <- GRanges(seqnames=names(cov.len), IRanges(start=rep(1, length(cov.len)), end=cov.len)) windows <- subsetByOverlaps(windows, cov.width, type="within", ignore.strand=FALSE) chr.idx <- intersect(names(peak.cov), unique(as.character(seqnames(windows)))) peakView <- Views(peak.cov[chr.idx], as(windows, "IntegerRangesList")[chr.idx]) tagMatrixList <- lapply(peakView, function(x) t(viewApply(x, as.vector))) tagMatrix <- do.call("rbind", tagMatrixList) ## get the index of windows, that are reorganized by as(windows, "IntegerRangesList") idx.list <- split(1:length(windows), as.factor(seqnames(windows))) idx <- do.call("c", idx.list) rownames(tagMatrix) <- idx tagMatrix <- tagMatrix[order(idx),] ## minus strand if (!ignore_strand) { minus.idx <- which(as.character(strand(windows)) == "-") tagMatrix[minus.idx,] <- tagMatrix[minus.idx, ncol(tagMatrix):1] } tagMatrix <- tagMatrix[rowSums(tagMatrix)!=0,] ## assign("tagMatrix", tagMatrix, envir=ChIPseekerEnv) return(tagMatrix) } ##' calculate the tagMatrix by binning ##' the idea was derived from the function of deeptools ##' https://deeptools.readthedocs.io/en/develop/content/tools/computeMatrix.html ##' ##' @title getTagMatrix.binning.internal ##' @param peak peak peak file or GRanges object ##' @param weightCol weightCol column name of weight, default is NULL ##' @param windows windows a collection of region with equal or not equal size, eg. promoter region, gene region. ##' @param nbin the amount of nbines needed to be splited and it should not be more than min_body_length ##' @param upstream rel object, NULL or actual number ##' @param downstream rel object, NULL or actual number ##' @param ignore_strand ignore the strand information or not ##' @import BiocGenerics S4Vectors IRanges GenomeInfoDb GenomicRanges ##' @importFrom ggplot2 rel ##' @return tagMatrix getTagMatrix.binning.internal <- function(peak, weightCol = NULL, windows, nbin = 800, upstream = NULL, downstream = NULL, ignore_strand = FALSE){ min_body_length <- filter_length <- nbin peak.gr <- loadPeak(peak) type <- attr(windows, 'type') if (!is(windows, "GRanges")) { stop("windows should be a GRanges object...") } if (is.null(weightCol)) { peak.cov <- coverage(peak.gr) } else { weight <- mcols(peak.gr)[[weightCol]] peak.cov <- coverage(peak.gr, weight=weight) } cov.len <- elementNROWS(peak.cov) cov.width <- GRanges(seqnames=names(cov.len), IRanges(start=rep(1, length(cov.len)), end=cov.len)) windows <- subsetByOverlaps(windows, cov.width, type="within", ignore.strand=FALSE) ## extend the windows by rel object if(inherits(upstream, 'rel')){ windows1 <- windows if(!ignore_strand){ positive_index <- which(as.character(strand(windows1)) == "+") negative_index <- which(as.character(strand(windows1)) == "-") start(windows1)[positive_index] <- suppressWarnings(start(windows1)[positive_index] - floor(width(windows)[positive_index]*as.numeric(upstream))) end(windows1)[positive_index] <- suppressWarnings(end(windows1)[positive_index] + floor(width(windows)[positive_index]*as.numeric(downstream))) start(windows1)[negative_index] <- suppressWarnings(start(windows1)[negative_index] - floor(width(windows)[negative_index]*as.numeric(downstream))) end(windows1)[negative_index] <- suppressWarnings(end(windows1)[negative_index] + floor(width(windows)[negative_index]*as.numeric(upstream))) }else{ start(windows1) <- suppressWarnings(start(windows1) - floor(width(windows)*as.numeric(upstream))) end(windows1) <- suppressWarnings(end(windows1) + floor(width(windows)*as.numeric(downstream))) } windows <- windows1 nbin <- floor(nbin*(1+as.numeric(downstream)+as.numeric(upstream))) min_body_length <- min_body_length*(1+as.numeric(upstream)+as.numeric(downstream)) cat(">> preparing matrix with extension from (",attr(windows,'label')[1],"-", 100*as.numeric(upstream),"%)~(",attr(windows,'label')[2],"+", 100*as.numeric(downstream),"%)... ", format(Sys.time(), "%Y-%m-%d %X"),"\n",sep = "") } ## do not extend if(is.null(upstream)){ if(attr(windows, 'type') == 'body'){ cat(">> preparing matrix for ",attr(windows, 'type')," region with no flank extension... ", format(Sys.time(), "%Y-%m-%d %X"),"\n",sep = "") }else{ cat(">> preparing matrix for ",attr(windows,'type')," region... ", format(Sys.time(), "%Y-%m-%d %X"),"\n",sep = "") } } ## extend the windows by actual number if(!is.null(upstream) && !inherits(upstream, 'rel') && attr(windows, 'type')== 'body'){ windows1 <- windows if(!ignore_strand){ positive_index <- which(as.character(strand(windows1)) == "+") negative_index <- which(as.character(strand(windows1)) == "-") start(windows1)[positive_index] <- suppressWarnings(start(windows1)[positive_index] - upstream) end(windows1)[positive_index] <- suppressWarnings(end(windows1)[positive_index] + downstream) start(windows1)[negative_index] <- suppressWarnings(start(windows1)[negative_index] - downstream) end(windows1)[negative_index] <- suppressWarnings(end(windows1)[negative_index] + upstream) }else{ start(windows1) <- suppressWarnings(start(windows1) - upstream) end(windows1) <- suppressWarnings(end(windows1) + downstream) } windows <- windows1 upstreamPer <- floor(upstream/1000)*0.1 downstreamPer <- floor(downstream/1000)*0.1 nbin <- floor(nbin*(1+upstreamPer+downstreamPer)) min_body_length <- min_body_length+upstream+downstream cat(">> preparing matrix with flank extension from (",attr(windows,'label')[1],"-", upstream,"bp)~(",attr(windows,'label')[2],"+",downstream,"bp)... ", format(Sys.time(), "%Y-%m-%d %X"),"\n",sep = "") } chr.idx <- intersect(names(peak.cov), unique(as.character(seqnames(windows)))) windows <- as(windows, "IntegerRangesList")[chr.idx] attr(windows,'type') <- type peakView <- Views(peak.cov[chr.idx], windows) ## remove the gene that has no binding proteins for (i in 1:length(peakView)) { index <- viewSums(peakView[[i]])!= 0 peakView[[i]] <- peakView[[i]][index] windows[[i]] <- windows[[i]][index] } tagMatrixList <- lapply(peakView, function(x) viewApply(x, as.vector)) if(!attr(windows, 'type') == 'body'){ tagMatrixList <- lapply(tagMatrixList, function(x) t(x)) # to remove the chromosome that do not bind protein index <- vapply(tagMatrixList, function(x) length(x)>0, FUN.VALUE = logical(1)) tagMatrixList <- tagMatrixList[index] windows <- windows[index] ## create a matrix to receive binning results tagMatrix <- list() ## this circulation is to deal with different chromosomes for (i in 1:length(tagMatrixList)) { tagMatrix[[i]] <- matrix(nrow = nrow(tagMatrixList[[i]]),ncol = nbin) ## this circulation is to deal with different genes for (j in 1:nrow(tagMatrixList[[i]])) { ## seq is the distance between different bins seq <- floor(length(tagMatrixList[[i]][j,])/nbin) ## cursor record the position of calculation cursor <- 1 ## the third circulation is to calculate the binding strength ## it has two parts ## the first part is to for the nbin(1:nbin-1) ## because the seq is not derived from exact division ## the second part is to compensate the loss of non-exact-division ## this the first part for 1:(nbin-1) for (k in 1:(nbin-1)) { read <- 0 for (z in cursor:(cursor+seq-1)) { read <- read + tagMatrixList[[i]][j,z] } tagMatrix[[i]][j,k] <- read/seq cursor <- cursor+seq } ## this the second part to to compensate the loss of non-exact-division read <- 0 for (z in cursor:length(tagMatrixList[[i]][j,])) { read <- read+tagMatrixList[[i]][j,z] } tagMatrix[[i]][j,nbin] <- read/(length(tagMatrixList[[i]][j,])-cursor+1) } if(!ignore_strand){ minus.idx <- which(as.character(mcols(windows[[i]])[["strand"]]) == "-") tagMatrix[[i]][minus.idx,] <- tagMatrix[[i]][minus.idx, ncol(tagMatrix[[i]]):1] } } }else{ ## extend genebody by atual number if(!is.null(upstream) & !inherits(upstream, 'rel')){ for (i in 1:length(tagMatrixList)) { if (length(class(tagMatrixList[[i]])) != 1) { sample <- tagMatrixList[[i]] tagMatrixList[[i]] <- lapply(seq_len(ncol(sample)), function(i) sample[,i]) } } index <- vapply(tagMatrixList, function(x) length(x)>0, FUN.VALUE = logical(1)) tagMatrixList <- tagMatrixList[index] windows <- windows[index] ## count the amount before filtering pre_amount <- 0 for(i in 1:length(tagMatrixList)){ pre_amount <- pre_amount+length(tagMatrixList[[i]]) } for (i in 1:length(tagMatrixList)) { index <- vapply(tagMatrixList[[i]], function(y) length(y)>min_body_length,FUN.VALUE = logical(1)) tagMatrixList[[i]] <- tagMatrixList[[i]][index] windows[[i]] <- windows[[i]][index] } ## count the amount after filtering amount <- 0 for(i in 1:length(tagMatrixList)){ amount <- amount+length(tagMatrixList[[i]]) } cat(">> ",pre_amount-amount," peaks(",100*((pre_amount-amount)/pre_amount), "%), having lengths smaller than ",filter_length,"bp, are filtered... ", format(Sys.time(), "%Y-%m-%d %X"),"\n",sep = "") upstreamnbin <- floor(nbin*(upstreamPer/(1+upstreamPer+downstreamPer))) bodynbin <- floor(nbin*(1/(1+upstreamPer+downstreamPer))) downstreamnbin <- floor(nbin*(downstreamPer/(1+upstreamPer+downstreamPer))) tagMatrix <- list() for (i in 1:length(tagMatrixList)) { tagMatrix[[i]] <- matrix(nrow = length(tagMatrixList[[i]]),ncol = nbin) ## count the upstream for (j in 1:length(tagMatrixList[[i]])) { seq <- floor(upstream/upstreamnbin) cursor <- 1 for (k in 1:(upstreamnbin-1)) { read <- 0 for (z in cursor:(cursor+seq-1)) { read <- read + tagMatrixList[[i]][[j]][z] } tagMatrix[[i]][j,k] <- read/seq cursor <- cursor+seq } read <- 0 for (z in cursor:upstream) { read <- read+tagMatrixList[[i]][[j]][z] } tagMatrix[[i]][j,upstreamnbin] <- read/(upstream-cursor) } ## count genebody for (j in 1:length(tagMatrixList[[i]])) { seq <- floor((length(tagMatrixList[[i]][[j]])-upstream-downstream)/bodynbin) cursor <- upstream+1 for (k in (upstreamnbin+1):(upstreamnbin+bodynbin-1)) { read <- 0 for (z in cursor:(cursor+seq-1)) { read <- read + tagMatrixList[[i]][[j]][z] } tagMatrix[[i]][j,k] <- read/seq cursor <- cursor+seq } read <- 0 for (z in cursor:(length(tagMatrixList[[i]][[j]])-downstream)) { read <- read+tagMatrixList[[i]][[j]][z] } tagMatrix[[i]][j,bodynbin+upstreamnbin] <- read/(length(tagMatrixList[[i]][[j]])-downstream-cursor) } ## count downstream for (j in 1:length(tagMatrixList[[i]])) { seq <- floor(downstream/downstreamnbin) cursor <- length(tagMatrixList[[i]][[j]])-downstream+1 for (k in (upstreamnbin+bodynbin+1):(nbin-1)) { read <- 0 for (z in cursor:(cursor+seq-1)) { read <- read + tagMatrixList[[i]][[j]][z] } tagMatrix[[i]][j,k] <- read/seq cursor <- cursor+seq } read <- 0 for (z in cursor:length(tagMatrixList[[i]][[j]])) { read <- read+tagMatrixList[[i]][[j]][z] } tagMatrix[[i]][j,nbin] <- read/(length(tagMatrixList[[i]][[j]])-cursor+1) } if(!ignore_strand){ minus.idx <- which(as.character(mcols(windows[[i]])[["strand"]]) == "-") tagMatrix[[i]][minus.idx,] <- tagMatrix[[i]][minus.idx, ncol(tagMatrix[[i]]):1] } } }else{ for (i in 1:length(tagMatrixList)) { if (length(class(tagMatrixList[[i]])) != 1) { sample <- tagMatrixList[[i]] tagMatrixList[[i]] <- lapply(seq_len(ncol(sample)), function(i) sample[,i]) } } index <- vapply(tagMatrixList, function(x) length(x)>0, FUN.VALUE = logical(1)) tagMatrixList <- tagMatrixList[index] windows <- windows[index] ## count the amount before filtering pre_amount <- 0 for(i in 1:length(tagMatrixList)){ pre_amount <- pre_amount+length(tagMatrixList[[i]]) } for (i in 1:length(tagMatrixList)) { index <- vapply(tagMatrixList[[i]], function(y) length(y)>min_body_length,FUN.VALUE = logical(1)) tagMatrixList[[i]] <- tagMatrixList[[i]][index] windows[[i]] <- windows[[i]][index] } ## count the amount after filtering amount <- 0 for(i in 1:length(tagMatrixList)){ amount <- amount+length(tagMatrixList[[i]]) } cat(">> ",pre_amount-amount," peaks(",100*((pre_amount-amount)/pre_amount), "%), having lengths smaller than ",filter_length,"bp, are filtered... ", format(Sys.time(), "%Y-%m-%d %X"),"\n",sep = "") tagMatrix <- list() for (i in 1:length(tagMatrixList)) { tagMatrix[[i]] <- matrix(nrow = length(tagMatrixList[[i]]),ncol = nbin) for (j in 1:length(tagMatrixList[[i]])) { seq <- floor(length(tagMatrixList[[i]][[j]])/nbin) cursor <- 1 for (k in 1:(nbin-1)) { read <- 0 for (z in cursor:(cursor+seq-1)) { read <- read + tagMatrixList[[i]][[j]][z] } tagMatrix[[i]][j,k] <- read/seq cursor <- cursor+seq } read <- 0 for (z in cursor:length(tagMatrixList[[i]][[j]])) { read <- read+tagMatrixList[[i]][[j]][z] } tagMatrix[[i]][j,nbin] <- read/(length(tagMatrixList[[i]][[j]])-cursor+1) } if(!ignore_strand){ minus.idx <- which(as.character(mcols(windows[[i]])[["strand"]]) == "-") tagMatrix[[i]][minus.idx,] <- tagMatrix[[i]][minus.idx, ncol(tagMatrix[[i]]):1] } } } } ## combine the results tagMatrix <- do.call("rbind",tagMatrix) return(tagMatrix) } ##' Nested function for getTagMatrix() to deal with multiple windows ##' ##' This is an internal function. ##' @title getTagMatrix2 ##' ##' @param peak peak peak file or GRanges object ##' @param upstream the distance of upstream extension ##' @param downstream the distance of downstream extension ##' @param windows_name the names of windows ##' @param type one of "start_site", "end_site", "body" ##' @param by one of 'gene', 'transcript', 'exon', 'intron', '3UTR' , '5UTR', or specified by users ##' @param TxDb TxDb or self-made granges object, served as txdb ##' @param weightCol column name of weight, default is NULL ##' @param nbin the amount of nbines ##' @param verbose print message or not ##' @param ignore_strand ignore the strand information or not ##' @return tagMatrix ##' @importFrom ggplot2 rel getTagMatrix2 <- function(peak, upstream, downstream, windows_name, type, by, TxDb=NULL, weightCol = NULL, nbin = NULL, verbose = TRUE, ignore_strand= FALSE){ names(TxDb) <- by windows <- lapply(as.list(by), function(x){ if(x %in% c('gene', 'transcript', 'exon', 'intron' , '3UTR' , '5UTR', 'UTR')){ result <- getBioRegion(TxDb=TxDb[[x]], upstream=upstream, downstream=downstream, by=x, type=type) }else{ result <- makeBioRegionFromGranges(gr=TxDb[[x]], by=x, type=type, upstream=upstream, downstream=downstream) } return(result) }) names(windows) <- windows_name # check the upstream and downstream parameter for body if(type == "body"){ if(missingArg(upstream)){ upstream <- NULL } if(missingArg(downstream)){ downstream <- NULL } }else{ upstream <- attr(windows[[1]], 'upstream') downstream <- attr(windows[[1]], 'downstream') } ## check upstream and downstream parameter check_upstream_and_downstream(upstream = upstream, downstream = downstream) if(type != 'body'){ if(inherits(upstream, 'rel') || is.null(upstream)){ stop("upstream and downstream for site region should be actual number...") } } ## check nbin parameters if(!is.null(nbin) && !is.numeric(nbin)){ stop('nbin should be NULL or numeric...') } if(type == 'body' && is.null(nbin)){ stop('plotting body region should set the nbin parameter...') } ## check nbin parameter if(!is.null(nbin)){ cat(">> binning method is used...", format(Sys.time(), "%Y-%m-%d %X"), "\n",sep = "") is.binning <- TRUE }else{ is.binning <- FALSE } if (verbose) { cat(">> preparing ",type," regions"," by ",paste(by,collapse = " "),"... ", format(Sys.time(), "%Y-%m-%d %X"), "\n",sep = "") } if(is.binning){ if (verbose) { cat(">> preparing tag matrix by binning... ", format(Sys.time(), "%Y-%m-%d %X"), "\n") } tagMatrix <- getTagMatrix2.binning.internal(peak = peak, weightCol = weightCol, windows = windows, windows_name=windows_name, nbin = nbin, upstream = upstream, downstream = downstream, ignore_strand = ignore_strand) }else{ if (verbose) { cat(">> preparing tag matrix... ", format(Sys.time(), "%Y-%m-%d %X"), "\n") } tagMatrix <- getTagMatrix2.internal(peak=peak, weightCol=weightCol, windows=windows, windows_name=windows_name, ignore_strand=ignore_strand) } names(tagMatrix) <- windows_name ## assign attribute tagMatrix <- lapply(tagMatrix, function(x){ attr(x, 'upstream') = upstream attr(x, 'downstream') = downstream attr(x, 'type') = attr(windows[[1]], 'type') attr(x, 'label') = attr(windows[[1]], 'label') attr(x, "is.binning") <- is.binning return(x) }) return(tagMatrix) } ##' @title getTagMatrix2.internal ##' ##' @param peak peak peak file or GRanges object ##' @param windows a collection of region ##' @param windows_name the name of windows ##' @param weightCol column name of weight, default is NULL ##' @param ignore_strand ignore the strand information or not getTagMatrix2.internal <- function(peak, weightCol=NULL, windows, windows_name, ignore_strand= FALSE) { mt_list <- lapply(windows_name, function(x){ windows_tmp <- windows[[x]] mt <- getTagMatrix.internal(peak=peak, weightCol=weightCol, windows=windows_tmp, ignore_strand=ignore_strand) return(mt) }) return(mt_list) } ##' internal function ##' ##' @param peak peak peak file or GRanges object ##' @param upstream the distance of upstream extension ##' @param downstream the distance of downstream extension ##' @param windows a collection of region ##' @param windows_name the name of windows ##' @param weightCol column name of weight, default is NULL ##' @param nbin the amount of nbines ##' @param ignore_strand ignore the strand information or not getTagMatrix2.binning.internal <- function(peak, weightCol = NULL, windows, windows_name, nbin = 800, upstream = NULL, downstream = NULL, ignore_strand = FALSE){ mt_list <- lapply(windows_name, function(x){ windows_tmp <- windows[[x]] mt <- getTagMatrix.binning.internal(peak = peak, weightCol = weightCol, windows = windows_tmp, nbin = nbin, upstream = upstream, downstream = downstream, ignore_strand = ignore_strand) return(mt) }) return(mt_list) } ================================================ FILE: R/upsetplot.R ================================================ ## @importFrom UpSetR upset ## @importFrom grid viewport ## @importFrom grid pushViewport ## @importFrom grid popViewport ## @importFrom gridBase gridPLT ## @importFrom graphics plot.new ##' @importFrom ggplot2 coord_fixed ##' @importFrom ggplot2 ggplot ##' @importFrom ggplot2 aes_ ##' @importFrom ggplot2 geom_bar ##' @importFrom ggplot2 xlab ##' @importFrom ggplot2 ylab ##' @importFrom ggplot2 theme_minimal ##' @author Guangchuang Yu upsetplot.csAnno <- function(x, order_by = "freq", vennpie=FALSE, vp = list(x=.6, y=.7, width=.8, height=.8)) { y <- x@detailGenomicAnnotation nn <- names(y) y <- as.matrix(y) res <- tibble::tibble(anno = lapply(1:nrow(y), function(i) nn[y[i,]])) g <- ggplot(res, aes_(x = ~anno)) + geom_bar() + xlab(NULL) + ylab(NULL) + theme_minimal() + ggupset::scale_x_upset(n_intersections = 20, order_by = order_by) if (!vennpie) return(g) f <- function() vennpie(x, cex = .9) p <- ggplotify::as.ggplot(f) + coord_fixed() ggplotify::as.ggplot(g) + ggimage::geom_subview(subview = p, x = vp$x, y = vp$y, width = vp$width, height = vp$height) ## y[y] <- 1 ## y <- as.data.frame(y) ## ## cn <- colnames(y) ## ## cn[cn == "fiveUTR"] <- "5 UTR" ## ## cn[cn == "threeUTR"] <- "3 UTR" ## ## colnames(y) <- cn ## if (is.null(sets)) { ## sets <- c("distal_intergenic", "downstream", ## "threeUTR", "fiveUTR", "Intron", ## "Exon", "Promoter") ## if (vennpie && is.null(sets.bar.color)) { ## sets.bar.color <- c("#d95f0e", "#fee0d2", "#98D277", ## "#6F9E4C", "#fc9272", "#9ecae1", "#ffeda0") ## } ## } ## if (is.null(sets.bar.color)) { ## sets.bar.color <- "black" ## } ## if (vennpie) { ## plot.new() ## # grid.rect(gp = gpar(fill="white")) ## upset(y, sets=sets, sets.bar.color=sets.bar.color, ## order.by = order.by, ...) ## pushViewport(vp) ## ##par(plt=gridPLT(), new=TRUE) ## vennpie(x) ## popViewport() ## } else { ## upset(y, sets=sets,sets.bar.color=sets.bar.color, ## order.by = order.by, ...) ## } } ================================================ FILE: R/utilities.R ================================================ #' @title env function for ChIPseeker #' @param TxDb txdb object #' @param item item name #' @param force force to update txdb item in cache or not. #' @importFrom yulab.utils get_cache_item #' @importFrom yulab.utils update_cache_item #' @importFrom yulab.utils rm_cache_item #' @importFrom yulab.utils initial_cache_item #' @importFrom S4Vectors metadata .ChIPseekerEnv <- function(TxDb, item = "ChIPseekerEnv", force = FALSE) { # get cache item # it will create a list if there is no a cache item cache_item <- get_cache_item(item) # if there is no TXDB cached, write in cache if (is.null(cache_item$TXDB)) { update_cache_item(item = item, list(TXDB = TxDb)) cat(">> Using Genome:", get_env_genome(),"...\n") return(invisible(NULL)) } # force to update item if(force){ cat(">> Force to update txdb in cache...\n") rm_cache_item(item) initial_cache_item(item) update_cache_item(item, list(TXDB = TxDb)) cat(">> Using Genome:", get_env_genome(),"...\n") } # if exist TXDB TXDB <- cache_item$TXDB m1 <- tryCatch(unlist(metadata(TXDB)), error = function(e) NULL) m2 <- tryCatch(unlist(metadata(TxDb)), error = function(e) NULL) if (!is.null(m1)) m1 <- m1[!is.na(m1)] if (!is.null(m2)) m2 <- m2[!is.na(m2)] txdb_flag <- is.character(all.equal(TXDB, TxDb)) if (is.null(m1) || is.null(m2) || length(m1) != length(m2) || any(m1 != m2) || txdb_flag) { cat(">> Update txdb in cache...\n") rm_cache_item(item) initial_cache_item(item) update_cache_item(item, list(TXDB = TxDb)) } cat(">> Using Genome:", get_env_genome(),"...\n") invisible(NULL) # pos <- 1 # envir <- as.environment(pos) # if (!exists("ChIPseekerEnv", envir=.GlobalEnv)) { # assign("ChIPseekerEnv", new.env(), envir = envir) # } # ChIPseekerEnv <- get("ChIPseekerEnv", envir=.GlobalEnv) # if (!exists("TXDB", envir=ChIPseekerEnv, inherits=FALSE)) { # ## first run # assign("TXDB", TxDb, envir=ChIPseekerEnv) # } else { # TXDB <- get("TXDB", envir=ChIPseekerEnv) # m1 <- tryCatch(unlist(metadata(TXDB)), error=function(e) NULL) # m2 <- unlist(metadata(TxDb)) # if (!is.null(m1)) { # m1 <- m1[!is.na(m1)] # } # m2 <- m2[!is.na(m2)] # if ( is.null(m1) || length(m1) != length(m2) || any(m1 != m2) ) { # rm(ChIPseekerEnv) # assign("ChIPseekerEnv", new.env(), envir = envir) # ChIPseekerEnv <- get("ChIPseekerEnv", envir=.GlobalEnv) # assign("TXDB", TxDb, envir=ChIPseekerEnv) # } # } } ##' @importFrom GenomicFeatures exonsBy ##' @importFrom yulab.utils get_cache_element ##' @importFrom yulab.utils update_cache_item get_exonList <- function(item = "ChIPseekerEnv") { # TxDb <- get("TXDB", envir=ChIPseekerEnv) TxDb <- get_cache_element(item = item, elements = "TXDB") exonList <- get_cache_element(item = item, elements = "exonList") if(is.null(exonList)){ exonList <- exonsBy(TxDb) update_cache_item(item = item, list("exonList" = exonList)) } # if ( exists("exonList", envir=ChIPseekerEnv, inherits=FALSE) ) { # exonList <- get("exonList", envir=ChIPseekerEnv) # } else { # exonList <- exonsBy(TxDb) # assign("exonList", exonList, envir=ChIPseekerEnv) # } return(exonList) } ##' @importFrom GenomicFeatures intronsByTranscript ##' @importFrom yulab.utils get_cache_element ##' @importFrom yulab.utils update_cache_item get_intronList <- function(item = "ChIPseekerEnv") { # TxDb <- get("TXDB", envir=ChIPseekerEnv) TxDb <- get_cache_element(item = item, elements = "TXDB") intronList <- get_cache_element(item = item, elements = "intronList") if(is.null(intronList)){ intronList <- intronsByTranscript(TxDb) update_cache_item(item = item, list("intronList" = intronList)) } # if ( exists("intronList", envir=ChIPseekerEnv, inherits=FALSE) ) { # intronList <- get("intronList", envir=ChIPseekerEnv) # } else { # intronList <- intronsByTranscript(TxDb) # assign("intronList", intronList, envir=ChIPseekerEnv) # } return(intronList) } getCols <- function(n) { col <- c("#8dd3c7", "#ffffb3", "#bebada", "#fb8072", "#80b1d3", "#fdb462", "#b3de69", "#fccde5", "#d9d9d9", "#bc80bd", "#ccebc5", "#ffed6f") col2 <- c("#1f78b4", "#ffff33", "#c2a5cf", "#ff7f00", "#810f7c", "#a6cee3", "#006d2c", "#4d4d4d", "#8c510a", "#d73027", "#78c679", "#7f0000", "#41b6c4", "#e7298a", "#54278f") col3 <- c("#a6cee3", "#1f78b4", "#b2df8a", "#33a02c", "#fb9a99", "#e31a1c", "#fdbf6f", "#ff7f00", "#cab2d6", "#6a3d9a", "#ffff99", "#b15928") ## colorRampPalette(brewer.pal(12, "Set3"))(n) col3[1:n] } getPalette <- function(n){ palette <- c("RdBu", "RdYlGn", "Spectral", "RdYlBu", "PiYG", "PRGn", "PuOr", "BrBG", "RdGy") palette[1:n] } getSgn <- function(data, idx){ d <- data[idx, ] ss <- colSums(d) ss <- ss / sum(ss) return(ss) } parseBootCiPerc <- function(bootCiPerc){ bootCiPerc <- bootCiPerc$percent tmp <- length(bootCiPerc) ciLo <- bootCiPerc[tmp - 1] ciUp <- bootCiPerc[tmp] return(c(ciLo, ciUp)) } ## estimate CI using bootstraping ##' @importFrom boot boot ##' @importFrom boot boot.ci ##' @importFrom parallel detectCores getTagCiMatrix <- function(tagMatrix, conf = 0.95, resample=500, ncpus=detectCores()-1){ RESAMPLE_TIME <- resample trackLen <- ncol(tagMatrix) if (Sys.info()[1] == "Windows") { tagMxBoot <- boot(data = tagMatrix, statistic = getSgn, R = RESAMPLE_TIME) } else { tagMxBoot <- boot(data = tagMatrix, statistic = getSgn, R = RESAMPLE_TIME, parallel = "multicore", ncpus = ncpus) } cat(">> Running bootstrapping for tag matrix...\t\t", format(Sys.time(), "%Y-%m-%d %X"), "\n") tagMxBootCi <- sapply(seq_len(trackLen), function(i) { bootCiToken <- boot.ci(tagMxBoot, type = "perc", index = i) ## parse boot.ci results return(parseBootCiPerc(bootCiToken)) } ) row.names(tagMxBootCi) <- c("Lower", "Upper") return(tagMxBootCi) } getTagCount <- function(tagMatrix, xlim, conf, ...) { ss <- colSums(tagMatrix) ss <- ss/sum(ss) ## plot(1:length(ss), ss, type="l", xlab=xlab, ylab=ylab) pos <- value <- NULL dd <- data.frame(pos=c(xlim[1]:xlim[2]), value=ss) if (!(missingArg(conf) || is.na(conf))){ tagCiMx <- getTagCiMatrix(tagMatrix, conf = conf, ...) dd$Lower <- tagCiMx["Lower", ] dd$Upper <- tagCiMx["Upper", ] } return(dd) } TXID2EG <- function(txid, geneIdOnly=FALSE) { txid <- as.character(txid) if (geneIdOnly == TRUE) { res <- TXID2EGID(txid) } else { res <- TXID2TXEG(txid) } return(res) } ##' @importFrom GenomicFeatures transcripts ##' @importFrom yulab.utils get_cache_element ##' @importFrom yulab.utils update_cache_item TXID2TXEG <- function(txid) { # ChIPseekerEnv <- get("ChIPseekerEnv", envir=.GlobalEnv) txid2geneid <- get_cache_element(item = ChIPseekerCache, elements = "txid2geneid") if(is.null(txid2geneid)){ txdb <- get_cache_element(item = ChIPseekerCache, elements = "TXDB") txidinfo <- transcripts(txdb, columns=c("tx_id", "tx_name", "gene_id")) idx <- which(sapply(txidinfo$gene_id, length) == 0) txidinfo[idx,]$gene_id <- txidinfo[idx,]$tx_name txid2geneid <- paste(mcols(txidinfo)[["tx_name"]], mcols(txidinfo)[["gene_id"]], sep="/") txid2geneid <- sub("/NA", "", txid2geneid) names(txid2geneid) <- mcols(txidinfo)[["tx_id"]] update_cache_item(item = ChIPseekerCache, list("txid2geneid" = txid2geneid)) } # if (exists("txid2geneid", envir=ChIPseekerEnv, inherits=FALSE)) { # txid2geneid <- get("txid2geneid", envir=ChIPseekerEnv) # } else { # txdb <- get("TXDB", envir=ChIPseekerEnv) # txidinfo <- transcripts(txdb, columns=c("tx_id", "tx_name", "gene_id")) # idx <- which(sapply(txidinfo$gene_id, length) == 0) # txidinfo[idx,]$gene_id <- txidinfo[idx,]$tx_name # txid2geneid <- paste(mcols(txidinfo)[["tx_name"]], # mcols(txidinfo)[["gene_id"]], # sep="/") # txid2geneid <- sub("/NA", "", txid2geneid) # names(txid2geneid) <- mcols(txidinfo)[["tx_id"]] # assign("txid2geneid", txid2geneid, envir=ChIPseekerEnv) # } return(as.character(txid2geneid[txid])) } ##' @importFrom yulab.utils get_cache_element ##' @importFrom yulab.utils update_cache_item TXID2EGID <- function(txid) { # ChIPseekerEnv <- get("ChIPseekerEnv", envir=.GlobalEnv) txid2geneid <- get_cache_element(item = ChIPseekerCache, elements = "txid2eg") if(is.null(txid2geneid)){ txdb <- get_cache_element(item = ChIPseekerCache, elements = "TXDB") txidinfo <- transcripts(txdb, columns=c("tx_id", "tx_name", "gene_id")) idx <- which(sapply(txidinfo$gene_id, length) == 0) txidinfo[idx,]$gene_id <- txidinfo[idx,]$tx_name txid2geneid <- as.character(mcols(txidinfo)[["gene_id"]]) names(txid2geneid) <- mcols(txidinfo)[["tx_id"]] update_cache_item(item = ChIPseekerCache, list("txid2eg" = txid2geneid)) } # if (exists("txid2eg", envir=ChIPseekerEnv, inherits=FALSE)) { # txid2geneid <- get("txid2eg", envir=ChIPseekerEnv) # } else { # txdb <- get("TXDB", envir=ChIPseekerEnv) # txidinfo <- transcripts(txdb, columns=c("tx_id", "tx_name", "gene_id")) # idx <- which(sapply(txidinfo$gene_id, length) == 0) # txidinfo[idx,]$gene_id <- txidinfo[idx,]$tx_name # txid2geneid <- as.character(mcols(txidinfo)[["gene_id"]]) # names(txid2geneid) <- mcols(txidinfo)[["tx_id"]] # assign("txid2eg", txid2geneid, envir=ChIPseekerEnv) # } return(as.character(txid2geneid[txid])) } ## according to: https://support.bioconductor.org/p/70432/#70545 ## contributed by Hervé Pagès getFirstHitIndex <- function(x) { ## sapply(unique(x), function(i) which(x == i)[1]) which(!duplicated(x)) } ##' calculate the overlap matrix, which is useful for vennplot ##' ##' ##' @title overlap ##' @param Sets a list of objects ##' @return data.frame ##' @importFrom gtools permutations ##' @export ##' @author G Yu overlap <- function(Sets) { ## this function is very generic. ## it call the getIntersectLength function to calculate ## the number of the intersection. ## if it fail, take a look at the object type were supported by getIntersectLength function. nn <- names(Sets) w <- t(apply(permutations(2,length(Sets),0:1, repeats.allowed=TRUE), 1 , rev)) rs <- rowSums(w) wd <- as.data.frame(w) wd$n <- NA for (i in length(nn):0) { idx <- which(rs == i) if (i == length(nn)) { len <- getIntersectLength(Sets, as.logical(w[idx,])) wd$n[idx] <- len } else if (i == 0) { wd$n[idx] <- 0 } else { for (ii in idx) { ##print(ii) len <- getIntersectLength(Sets, as.logical(w[ii,])) ww = w[ii,] jj <- which(ww == 0) pp <- permutations(2, length(jj), 0:1, repeats.allowed=TRUE) for (aa in 2:nrow(pp)) { ## 1st row is all 0, abondoned xx <- jj[as.logical(pp[aa,])] ww[xx] =ww[xx] +1 bb <- t(apply(w, 1, function(i) i == ww)) wd$n[rowSums(bb) == length(ww) ] ww <- w[ii,] len <- len - wd$n[rowSums(bb) == length(ww) ] ww <- w[ii,] } wd$n[ii] <- len } } } colnames(wd) = c(names(Sets), "Weight") return(wd) } getIntersectLength <- function(Sets, idx) { ## only use intersect and length methods in this function ## works fine with GRanges object ## and easy to extend to other objects. ss= Sets[idx] ol <- ss[[1]] if (sum(idx) == 1) { return(length(ol)) } for (j in 2:length(ss)) { ol <- intersect(ol, ss[[j]]) } return(length(ol)) } loadPeak <- function(peak, verbose=FALSE) { if (is(peak, "GRanges")) { peak.gr <- peak } else if (file.exists(peak)) { if (verbose) cat(">> loading peak file...\t\t\t\t", format(Sys.time(), "%Y-%m-%d %X"), "\n") peak.gr <- readPeakFile(peak, as="GRanges") } else { stop("peak should be GRanges object or a peak file...") } return(peak.gr) } ##' @importFrom TxDb.Hsapiens.UCSC.hg19.knownGene TxDb.Hsapiens.UCSC.hg19.knownGene loadTxDb <- function(TxDb) { if ( is.null(TxDb) ) { warning(">> TxDb is not specified, use 'TxDb.Hsapiens.UCSC.hg19.knownGene' by default...") TxDb <- TxDb.Hsapiens.UCSC.hg19.knownGene } return(TxDb) } ##' @importFrom AnnotationDbi get ##' @importFrom GenomicFeatures genes ##' @importFrom GenomicFeatures transcriptsBy ##' @importFrom yulab.utils get_cache_element ##' @importFrom yulab.utils update_cache_item getGene <- function(TxDb, by="gene") { .ChIPseekerEnv(TxDb, item = ChIPseekerCache) # ChIPseekerEnv <- get("ChIPseekerEnv", envir=.GlobalEnv) by <- match.arg(by, c("gene", "transcript")) if (by == "gene") { features <- get_cache_element(item = ChIPseekerCache, elements = "Genes") if(is.null(features)){ features <- suppressMessages(genes(TxDb)) update_cache_item(item = ChIPseekerCache, list("Genes" = features)) } # if ( exists("Genes", envir=ChIPseekerEnv, inherits=FALSE) ) { # features <- get("Genes", envir=ChIPseekerEnv) # } else { # features <- suppressMessages(genes(TxDb)) # assign("Genes", features, envir=ChIPseekerEnv) # } } else { features <- get_cache_element(item = ChIPseekerCache, elements = "Transcripts") if(is.null(features)){ features <- transcriptsBy(TxDb) features <- unlist(features) update_cache_item(item = ChIPseekerCache, list("Transcripts" = features)) } # if ( exists("Transcripts", envir=ChIPseekerEnv, inherits=FALSE) ) { # features <- get("Transcripts", envir=ChIPseekerEnv) # } else { # features <- transcriptsBy(TxDb) # features <- unlist(features) # assign("Transcripts", features, envir=ChIPseekerEnv) # } } return(features) } ##' get filenames of sample files ##' ##' ##' @title getSampleFiles ##' @return list of file names ##' @export ##' @author G Yu getSampleFiles <- function() { dir <- system.file("extdata", "GEO_sample_data", package="ChIPseeker") files <- list.files(dir) ## protein <- sub("GSM\\d+_", "", files) ## protein <- sub("_.+", "", protein) protein <- gsub(pattern='GSM\\d+_(\\w+_\\w+)_.*', replacement='\\1',files) protein <- sub("_Chip.+", "", protein) res <- paste(dir, files, sep="/") res <- as.list(res) names(res) <- protein return(res) } ## @importFrom RCurl getURL ## getDirListing <- function (url) { ## ## from GEOquery ## print(url) ## a <- getURL(url) ## b <- textConnection(a) ## d <- read.table(b, header = FALSE) ## close(b) ## return(d) ## } is.dir <- function(dir) { if (file.exists(dir) == FALSE) return(FALSE) return(file.info(dir)$isdir) } parse_targetPeak_Param <- function(targetPeak) { if (length(targetPeak) == 1) { if (is.dir(targetPeak)) { files <- list.files(path=targetPeak) idx <- unlist(sapply(c("bed", "bedGraph", "Peak"), grep, x=files)) idx <- sort(unique(idx)) files <- files[idx] targetPeak <- sub("/$", "", targetPeak) res <- paste(targetPeak, files, sep="/") } else { if (!file.exists(targetPeak)) { stop("bed file is not exists...") } else { res <- targetPeak } } } else { if (is.dir(targetPeak[1])) { stop("targetPeak should be a vector of bed file names or a folder containing bed files...") } else { res <- targetPeak[file.exists(targetPeak)] if (length(res) == 0) { stop("targetPeak file not exists...") } } } return(res) } IDType <- function(TxDb) { ## ## IDType <- metadata(TxDb)[8,2] ## ## update: 2015-10-27 ## now IDType change from metadata(TxDb)[8,2] to metadata(TxDb)[9,2] ## it may change in future too ## ## it's safe to extract via grep md <- metadata(TxDb) md[grep("Type of Gene ID", md[,1]), 2] } list_to_dataframe <- function(dataList) { if (is.null(names(dataList))) return(do.call('rbind', dataList)) cn <- lapply(dataList, colnames) %>% unlist %>% unique cn <- c('.id', cn) dataList2 <- lapply(seq_along(dataList), function(i) { data = dataList[[i]] data$.id = names(dataList)[i] idx <- ! cn %in% colnames(data) if (sum(idx) > 0) { for (i in cn[idx]) { data[, i] <- NA } } return(data[,cn]) }) res <- do.call('rbind', dataList2) res$.id <- factor(res$.id, levels=rev(names(dataList))) return(res) } ##' @importFrom GenomicRanges GRangesList ##' @export GenomicRanges::GRangesList ## . function was from plyr package ##' capture name of variable ##' ##' @rdname dotFun ##' @export ##' @title . ##' @param ... expression ##' @param .env environment ##' @return expression ##' @examples ##' x <- 1 ##' eval(.(x)[[1]]) . <- function (..., .env = parent.frame()) { structure(as.list(match.call()[-1]), env = .env, class = "quoted") } ##' check upstream and downstream parameter ##' ##' ##' check_upstream_and_downstream ##' ##' @param upstream upstream ##' @param downstream downstream ##' @importFrom ggplot2 rel check_upstream_and_downstream <- function(upstream, downstream){ ## upstream and downstream should be the same type if(class(upstream) != class(downstream)){ stop("the type of upstream and downstream should be the same...") } ## downstream and upstream parameter should be numeric or NULL if(!is.numeric(upstream) && !is.null(upstream)){ stop("upstream and downstream parameter should be numeric or NULL...") } ## the value of rel object should be in (0,1) if(inherits(upstream, 'rel')){ if(as.numeric(upstream) < 0 || as.numeric(upstream) >1 ){ stop('the value of rel object should be in (0,1)...') } } ## check actual number if(is.numeric(upstream) && !inherits(upstream, 'rel')){ if(upstream < 1 | downstream < 1){ stop('if upstream or downstream is integer, the value of it should be greater than 1...') } } } ##' @importFrom ggplot2 rel ##' ##' @export ggplot2::rel ##' make label for figures ##' @param by one of 'gene', 'transcript', 'exon', 'intron' , '3UTR' , '5UTR', 'UTR' ##' @param type one of "start_site", "end_site", "body" make_label <- function(type, by){ if(type == 'body'){ if(by %in% c('gene', 'transcript', 'exon', 'intron')){ label_SS <- paste0("T","SS") label_TS <- paste0("T","TS") label <- c(label_SS,label_TS) }else{ label_SS <- paste0(by,"_SS") label_TS <- paste0(by,"_TS") label <- c(label_SS,label_TS) } }else if(type == "start_site"){ if(by %in% c('gene', 'transcript', 'exon', 'intron')){ label <- paste0("T","SS") }else{ label <- paste0(by,"_SS") } }else{ if(by %in% c('gene', 'transcript', 'exon', 'intron')){ label <- paste0("T","TS") }else{ label <- paste0(by,"_TS") } } return(label) } ##' @importFrom yulab.utils get_cache_item get_env_genome <- function(){ current_env <- get_cache_item(item = ChIPseekerCache) env_txdb <- current_env$TXDB env_txdb_meta <- S4Vectors::metadata(env_txdb) env_txdb_version <- env_txdb_meta[grep("Genome",env_txdb_meta[,1]),2] return(env_txdb_version) } ================================================ FILE: R/vennpie.R ================================================ ##' @importFrom plotrix floating.pie vennpie.csAnno <- function(x, r = 0.2, cex = 1.2, col = NULL) { detailGenomicAnnotation <- x@detailGenomicAnnotation distance <- as.data.frame(x)$distanceToTSS total <- nrow(detailGenomicAnnotation) Genic <- sum(detailGenomicAnnotation$genic) Intergenic <- total-Genic Distal_Intergenic <- sum(detailGenomicAnnotation$distal_intergenic) Intron <- sum(detailGenomicAnnotation$Intron) Exon <- sum(detailGenomicAnnotation$Exon) Upstream <- sum(detailGenomicAnnotation$Promoter & distance < 0) ## fiveUTR <- sum(detailGenomicAnnotation$fiveUTR) ## threeUTR <- sum(detailGenomicAnnotation$threeUTR) Downstream <- sum(detailGenomicAnnotation$downstream) ## fiveUTR='#e5f5e0',threeUTR='#a1d99b', cols <- c(NO='white', Genic='#3182bd', Intergenic='#fec44f', Intron='#fc9272', Exon='#9ecae1', Upstream='#ffeda0', Downstream='#fee0d2', Distal_Intergenic='#d95f0e') cols[names(col)] <- col ##par(mai = c(0,0,0,0)) ##layout(matrix(c(1,2), ncol=2), widths=c(0.7,0.3)) pie(1, radius=r, init.angle=90, col="white", border=NA, labels='') ## https://www.biostars.org/p/326456/ ## if count is 0, floating pie will ignore it ## and the color will mismatch with the category ## fixed by adding pseudo-count +1 floating.pie(0,0, c(Exon, Genic-Exon, Distal_Intergenic, Downstream, Intergenic-Distal_Intergenic-Downstream ) + 1, radius=4*r, startpos=pi/2, col=cols[c("Exon", "NO", "NO", "Downstream", "NO")], border=NA) floating.pie(0,0, c(Genic-Intron, Intron, Distal_Intergenic, Intergenic-Upstream-Distal_Intergenic, Upstream) +1 , radius=3*r, startpos=pi/2, col=cols[c("NO", "Intron", "Distal_Intergenic", "NO", "Upstream")], border=NA) floating.pie(0, 0, c(Genic, Intergenic) +1, radius=2*r, startpos=pi/2, col=cols[c("Genic", "Intergenic")], border=NA) ##plot.new() ##legend(center), legend=names(cols)[-1], fill=cols[-1], bty="n") legend(3*r, 3*r, legend=sub("_", " ", names(cols)[-1]), fill=cols[-1], bty="n", cex=cex) } ================================================ FILE: R/vennplot.R ================================================ ##' plot the overlap of a list of object ##' ##' ##' There are two ways to plot, which users can specify through `by`. ##' ##' The first way is to use `gplots` packages, by setting `by = gplots`. This method ##' is default method. The venn plot produced through this way has no color. ##' ##' The second way is to use `ggVennDiagram` packages, by setting `by = ggVennDiagram`. ##' The venn plot produced through this way has colors which can be defined by users using ##' ggplot2 grammar e.g.(scale_fill_distiller()). And users can specify any details, like digital number, ##' text size and showing percentage or not, by inputting `...` extra parameters. ##' ##' @title vennplot ##' @param Sets a list of object, can be vector or GRanges object ##' @param by one of gplots, ggVennDiagram or Vennerable ##' @param ... extra parameters using ggVennDiagram. Details see \link[ggVennDiagram]{ggVennDiagram} ##' @return venn plot that summarize the overlap of peaks ##' from different experiments or gene annotation from ##' different peak files. ##' @importFrom gplots plot.venn ## @importFrom ggVennDiagram ggVennDiagram ## @importFrom Vennerable Venn ## @importFrom grid grid.newpage ##' @examples ##' ## example not run ##' ## require(TxDb.Hsapiens.UCSC.hg19.knownGene) ##' ## txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene ##' ## peakfiles <- getSampleFiles() ##' ## peakAnnoList <- lapply(peakfiles, annotatePeak) ##' ## names(peakAnnoList) <- names(peakfiles) ##' ## genes= lapply(peakAnnoList, function(i) as.data.frame(i)$geneId) ##' ## vennplot(genes) ##' @export ##' @author G Yu vennplot <- function(Sets, by="gplots",...) { if (is.null(names(Sets))) { nn <- paste0("Set", seq_along(Sets)) warning("input is not a named list, set the name automatically to ", paste(nn, collapse = " ")) names(Sets) <- nn ## stop("input object should be a named list...") } overlapDF <- overlap(Sets) if (by == "Vennerable") { ## setRepositories(ind=7) ## install.package("Vennerable") ## OR ## install.packages("Vennerable", repos="http://R-Forge.R-project.org") pkg <- "Vennerable" require(pkg, character.only=TRUE) Venn <- eval(parse(text="Venn")) v <- Venn(SetNames=names(Sets), Weight=overlapDF$Weight) plotVenn <- eval(parse(text="Vennerable:::plotVenn")) plotVenn(v) } else if (by == "gplots") { n <- ncol(overlapDF) colnames(overlapDF)[n] <- "num" overlapDF <- overlapDF[, c(n, 1:(n-1))] rownames(overlapDF)=apply(overlapDF, 1, function(i) paste(i[-1], sep="", collapse="")) vennCount <- as.matrix(overlapDF) class(vennCount) <- "venn" plot.venn(vennCount) } else if(by == "ggVennDiagram"){ ggVennDiagram::ggVennDiagram(Sets, ...) } else { stop("not supported...") } } ##' vennplot for peak files ##' ##' ##' @title vennplot.peakfile ##' @param files peak files ##' @param labels labels for peak files ##' @return figure ##' @export ##' @author G Yu vennplot.peakfile <- function(files, labels=NULL) { peak.Sets <- lapply(files, readPeakFile) if (is.null(labels)) { ## remove .xls or .bed of the file names as labels labels <- sub("\\.\\w+$", "", files) } names(peak.Sets) <- labels vennplot(peak.Sets) } ================================================ FILE: R/zzz.R ================================================ ##' @importFrom yulab.utils yulab_msg .onAttach <- function(libname, pkgname) { packageStartupMessage(yulab_msg(pkgname)) options(ChIPseeker.downstreamDistance = 300) options(ChIPseeker.ignore_1st_exon = FALSE) options(ChIPseeker.ignore_1st_intron = FALSE) options(ChIPseeker.ignore_downstream = FALSE) options(ChIPseeker.ignore_promoter_subcategory= FALSE) options(aplot_align = 'y') } ================================================ FILE: README.Rmd ================================================ --- output: md_document: variant: gfm html_preview: false --- ```{r echo=FALSE, results="hide", message=FALSE} #library("txtplot") library("badger") library("ypages") library("yulab.utils") ``` # ChIPseeker: ChIP peak Annotation, Comparison, and Visualization `r badge_bioc_release("ChIPseeker", "green")` `r badge_devel("guangchuangyu/ChIPseeker", "green")` [![Bioc](http://www.bioconductor.org/shields/years-in-bioc/ChIPseeker.svg)](https://www.bioconductor.org/packages/devel/bioc/html/ChIPseeker.html#since) [![Say Thanks!](https://img.shields.io/badge/Say%20Thanks-!-1EAEDB.svg)](https://saythanks.io/to/GuangchuangYu) [![Project Status: Active - The project has reached a stable, usable state and is being actively developed.](http://www.repostatus.org/badges/latest/active.svg)](http://www.repostatus.org/#active) [![codecov](https://codecov.io/gh/GuangchuangYu/ChIPseeker/branch/master/graph/badge.svg)](https://codecov.io/gh/GuangchuangYu/ChIPseeker/) [![Last-changedate](https://img.shields.io/badge/last%20change-`r gsub('-', '--', Sys.Date())`-green.svg)](https://github.com/GuangchuangYu/ChIPseeker/commits/master) [![GitHub forks](https://img.shields.io/github/forks/GuangchuangYu/ChIPseeker.svg)](https://github.com/GuangchuangYu/ChIPseeker/network) [![GitHub stars](https://img.shields.io/github/stars/GuangchuangYu/ChIPseeker.svg)](https://github.com/GuangchuangYu/ChIPseeker/stargazers) [![platform](http://www.bioconductor.org/shields/availability/devel/ChIPseeker.svg)](https://www.bioconductor.org/packages/devel/bioc/html/ChIPseeker.html#archives) [![Build Status](http://www.bioconductor.org/shields/build/devel/bioc/ChIPseeker.svg)](https://bioconductor.org/checkResults/devel/bioc-LATEST/ChIPseeker/) [![Linux/Mac Travis Build Status](https://img.shields.io/travis/GuangchuangYu/ChIPseeker/master.svg?label=Mac%20OSX%20%26%20Linux)](https://travis-ci.org/GuangchuangYu/ChIPseeker) [![AppVeyor Build Status](https://img.shields.io/appveyor/ci/Guangchuangyu/ChIPseeker/master.svg?label=Windows)](https://ci.appveyor.com/project/GuangchuangYu/ChIPseeker) ```{r comment="", echo=FALSE, results='asis'} cat(packageDescription('ChIPseeker')$Description) ``` ## :writing_hand: Authors Guangchuang YU School of Basic Medical Sciences, Southern Medical University If you use `r Biocpkg('ChIPseeker')` in published research, please cite: + Q Wang#, M Li#, T Wu, L Zhan, L Li, M Chen, W Xie, Z Xie, E Hu, S Xu, __G Yu__\*. [Exploring epigenomic datasets by ChIPseeker](https://onlinelibrary.wiley.com/share/author/GYJGUBYCTRMYJFN2JFZZ?target=10.1002/cpz1.585). __*Current Protocols*__, 2022, 2(10): e585. + __G Yu__\*, LG Wang, QY He\*. [ChIPseeker: an R/Bioconductor package for ChIP peak annotation, comparision and visualization](http://bioinformatics.oxfordjournals.org/cgi/content/abstract/btv145). __*Bioinformatics*__. 2015, 31(14):2382-2383. ## :arrow_double_down: Installation Get the released version from Bioconductor: ```r ## try http:// if https:// URLs are not supported if (!requireNamespace("BiocManager", quietly=TRUE)) install.packages("BiocManager") ## BiocManager::install("BiocUpgrade") ## you may need this BiocManager::install("ChIPseeker") ``` Or the development version from github: ```r ## install.packages("devtools") devtools::install_github("YuLab-SMU/ChIPseeker") ``` ## Contributing We welcome any contributions! By participating in this project you agree to abide by the terms outlined in the [Contributor Code of Conduct](CONDUCT.md). ================================================ FILE: README.md ================================================ # ChIPseeker: ChIP peak Annotation, Comparison, and Visualization [![](https://img.shields.io/badge/release%20version-1.32.1-green.svg)](https://www.bioconductor.org/packages/ChIPseeker) [![](https://img.shields.io/badge/devel%20version-1.33.4-green.svg)](https://github.com/guangchuangyu/ChIPseeker) [![Bioc](http://www.bioconductor.org/shields/years-in-bioc/ChIPseeker.svg)](https://www.bioconductor.org/packages/devel/bioc/html/ChIPseeker.html#since) [![Say Thanks!](https://img.shields.io/badge/Say%20Thanks-!-1EAEDB.svg)](https://saythanks.io/to/GuangchuangYu) [![Project Status: Active - The project has reached a stable, usable state and is being actively developed.](http://www.repostatus.org/badges/latest/active.svg)](http://www.repostatus.org/#active) [![codecov](https://codecov.io/gh/GuangchuangYu/ChIPseeker/branch/master/graph/badge.svg)](https://codecov.io/gh/GuangchuangYu/ChIPseeker/) [![Last-changedate](https://img.shields.io/badge/last%20change-2022--10--29-green.svg)](https://github.com/GuangchuangYu/ChIPseeker/commits/master) [![GitHub forks](https://img.shields.io/github/forks/GuangchuangYu/ChIPseeker.svg)](https://github.com/GuangchuangYu/ChIPseeker/network) [![GitHub stars](https://img.shields.io/github/stars/GuangchuangYu/ChIPseeker.svg)](https://github.com/GuangchuangYu/ChIPseeker/stargazers) [![platform](http://www.bioconductor.org/shields/availability/devel/ChIPseeker.svg)](https://www.bioconductor.org/packages/devel/bioc/html/ChIPseeker.html#archives) [![Build Status](http://www.bioconductor.org/shields/build/devel/bioc/ChIPseeker.svg)](https://bioconductor.org/checkResults/devel/bioc-LATEST/ChIPseeker/) [![Linux/Mac Travis Build Status](https://img.shields.io/travis/GuangchuangYu/ChIPseeker/master.svg?label=Mac%20OSX%20%26%20Linux)](https://travis-ci.org/GuangchuangYu/ChIPseeker) [![AppVeyor Build Status](https://img.shields.io/appveyor/ci/Guangchuangyu/ChIPseeker/master.svg?label=Windows)](https://ci.appveyor.com/project/GuangchuangYu/ChIPseeker) This package implements functions to retrieve the nearest genes around the peak, annotate genomic region of the peak, statstical methods for estimate the significance of overlap among ChIP peak data sets, and incorporate GEO database for user to compare the own dataset with those deposited in database. The comparison can be used to infer cooperative regulation and thus can be used to generate hypotheses. Several visualization functions are implemented to summarize the coverage of the peak experiment, average profile and heatmap of peaks binding to TSS regions, genomic annotation, distance to TSS, and overlap of peaks or genes. ## :writing_hand: Authors Guangchuang YU School of Basic Medical Sciences, Southern Medical University If you use [ChIPseeker](http://bioconductor.org/packages/ChIPseeker) in published research, please cite: - Q Wang\#, M Li\#, T Wu, L Zhan, L Li, M Chen, W Xie, Z Xie, E Hu, S Xu, **G Yu**\*. [Exploring epigenomic datasets by ChIPseeker](https://onlinelibrary.wiley.com/share/author/GYJGUBYCTRMYJFN2JFZZ?target=10.1002/cpz1.585). ***Current Protocols***, 2022, 2(10): e585. - **G Yu**\*, LG Wang, QY He\*. [ChIPseeker: an R/Bioconductor package for ChIP peak annotation, comparision and visualization](http://bioinformatics.oxfordjournals.org/cgi/content/abstract/btv145). ***Bioinformatics***. 2015, 31(14):2382-2383. ## :arrow_double_down: Installation Get the released version from Bioconductor: ``` r ## try http:// if https:// URLs are not supported if (!requireNamespace("BiocManager", quietly=TRUE)) install.packages("BiocManager") ## BiocManager::install("BiocUpgrade") ## you may need this BiocManager::install("ChIPseeker") ``` Or the development version from github: ``` r ## install.packages("devtools") devtools::install_github("YuLab-SMU/ChIPseeker") ``` ## Contributing We welcome any contributions! By participating in this project you agree to abide by the terms outlined in the [Contributor Code of Conduct](CONDUCT.md). ================================================ FILE: appveyor.yml ================================================ environment: matrix: - R_VERSION: devel R_ARCH: x64 USE_RTOOLS: true _R_CHECK_FORCE_SUGGESTS_: false # DO NOT CHANGE the "init" and "install" sections below # Download script file from GitHub init: ps: | $ErrorActionPreference = "Stop" Invoke-WebRequest http://raw.github.com/krlmlr/r-appveyor/master/scripts/appveyor-tool.ps1 -OutFile "..\appveyor-tool.ps1" Import-Module '..\appveyor-tool.ps1' install: ps: Bootstrap # Adapt as necessary starting from here build_script: - travis-tool.sh install_bioc BiocStyle IRanges graphite ReactomePA AnnotationDbi DO.db DOSE org.Hs.eg.db TxDb.Hsapiens.UCSC.hg19.knownGene clusterProfiler - travis-tool.sh install_deps test_script: - travis-tool.sh run_tests on_failure: - 7z a failure.zip *.Rcheck\* - appveyor PushArtifact failure.zip artifacts: - path: '*.Rcheck\**\*.log' name: Logs - path: '*.Rcheck\**\*.out' name: Logs - path: '*.Rcheck\**\*.fail' name: Logs - path: '*.Rcheck\**\*.Rout' name: Logs - path: '\*_*.tar.gz' name: Linux Package - path: '\*_*.zip' name: Windows Package notifications: - provider: Email to: - gcyu@connect.hku.hk on_build_success: false ================================================ FILE: inst/CITATION ================================================ citHeader("Please cite Q. Wang (2022) or G. Yu (2015) for using ChIPseeker. In addition, please cite clusterProfiler/DOSE/ReactomePA when using functional enrichment analyses.") citEntry(entry ="ARTICLE", title = "Exploring epigenomic datasets by ChIPseeker", author = c( person("Qianwen", "Wang"), person("Ming", "Li"), person("Tianzhi", "Wu"), person("Li", "Zhan"), person("Lin", "Li"), person("Meijun", "Chen"), person("Wenqin", "Xie"), person("Zijing", "Xie"), person("Erqiang", "Hu"), person("Shuangbin", "Xu"), person("Guangchuang", "Yu", email = "guangchuangyu@gmail.com") ), journal = "Current Protocols", year = "2022", volume = "2", number = "10", pages = "e585", PMID = "36286622", doi = "10.1002/cpz1.585", url = "https://onlinelibrary.wiley.com/share/author/GYJGUBYCTRMYJFN2JFZZ?target=10.1002/cpz1.585", textVersion = paste("Qianwen Wang, Ming Li, Tianzhi Wu, Li Zhan, Lin Li, Meijun Chen, Wenqin Xie, Zijing Xie, Erqiang Hu, Shuangbin Xu, Guangchuang Yu.", "Exploring epigenomic datasets by ChIPseeker.", "Current Protocols 2022, 2(10): e585") ) citEntry(entry ="ARTICLE", title = "ChIPseeker: an R/Bioconductor package for ChIP peak annotation, comparison and visualization", author = personList( as.person("Guangchuang Yu"), as.person("Li-Gen Wang"), as.person("Qing-Yu He") ), journal = "Bioinformatics", year = "2015", volume = "31", number = "14", pages = "2382-2383", PMID = "25765347", doi = "10.1093/bioinformatics/btv145", textVersion = paste("Guangchuang Yu, LiGen Wang, and QingYu He.", "ChIPseeker: an R/Bioconductor package for ChIP peak annotation, comparison and visualization.", "Bioinformatics 2015, 31(14):23822383") ) ================================================ FILE: inst/extdata/sample_peaks.txt ================================================ chr start end length summit tags X.10.log10.pvalue. fold_enrichment FDR... chr10 105137980 105138593 614 174 7 52.8 15.32 NA chr10 42644416 42645383 968 669 27 77.15 9.13 0.79 chr6 162188189 162188742 554 174 8 59.88 17.82 NA chr4 2307246 2307622 377 188 9 85.25 26.62 1.03 chr13 51791808 51792240 433 267 8 65.32 15.08 0.74 chr19 58731678 58732653 976 472 15 55 8.04 NA chr16 47717334 47717718 385 193 8 75.05 18.38 NA chr5 133997712 133998281 570 216 11 59.14 10.48 NA chr17 21730549 21731638 1090 685 15 64.79 9.19 NA chr1 108313793 108314347 555 189 7 58.81 18.16 0.78 chr7 66713702 66714183 482 285 7 50.52 14.54 0.97 chr6 102911566 102912390 825 476 17 53.89 6.13 NA chr7 99693575 99694953 1379 285 19 54.14 6.93 0.92 chr14 21930630 21931094 465 289 7 57.91 11.83 0.78 chr7 140460467 140461015 549 221 10 51.62 9.66 0.98 chr1 204827779 204828392 614 236 10 54.94 11.73 0.85 chr22 31055016 31055701 686 486 9 68.84 24.51 NA chr12 121813018 121813753 736 507 12 63.05 13.41 0.77 chr9 34128226 34128829 604 266 9 62.93 15.08 0.77 chr8 51449160 51449457 298 150 7 63.71 9.59 NA chr7 152266697 152267502 806 218 13 66.03 10.72 NA chr7 5200045 5200648 604 217 10 51.25 9.72 NA chr20 32330916 32331273 358 179 7 51.33 14.66 0.98 chr13 53041893 53042341 449 256 7 65.63 21.79 0.74 chr4 188271660 188272266 607 340 10 51.25 11.16 NA chr11 57336216 57336780 565 167 8 55.92 15.08 0.85 chr7 147992515 147993057 543 199 8 70.42 18.16 0.7 chr9 126707435 126708020 586 198 10 89.63 28.73 1.04 chr8 146099083 146099655 573 205 8 65.26 18.38 NA chr3 18887631 18888098 468 210 10 80.46 19.36 NA chr15 75296138 75296986 849 382 14 62.33 7.8 NA chr7 76678195 76678851 657 217 12 68.53 10.06 0.62 chr10 72271427 72271869 443 225 8 50.24 14.25 NA chr3 179584381 179584987 607 177 11 50.71 9.19 NA chr4 163405047 163405425 379 189 7 64.2 22 0.77 chr7 72657437 72658247 811 241 19 81.36 8.15 0.91 chr17 47813522 47813967 446 184 7 50.24 12.3 NA chr19 17019713 17020403 691 501 10 51.25 11.12 NA chr2 112229859 112230262 404 203 8 56.88 14.85 NA chr10 69646478 69646970 493 313 7 56.06 17.74 0.84 chr22 21699160 21699710 551 365 7 58.69 17.96 0.79 chrX 119427920 119428427 508 197 7 50.24 14.01 NA chr11 12358590 12358928 339 169 7 53.91 11.93 0.91 chr10 23293727 23294379 653 446 8 50.61 9.19 NA chr16 19008911 19009452 542 178 8 53.74 10.45 NA chr11 71999828 72000241 414 242 8 79.93 25.14 0.83 chr17 38168330 38169125 796 363 11 57.63 15.91 NA chr7 56124436 56124960 525 314 11 88.99 20.68 NA chr22 26998729 26999306 578 216 10 72.04 17.51 NA chr11 108369506 108370050 545 349 7 50.52 13.7 0.97 chr1 247012833 247013232 400 200 9 69.75 18.1 0.61 chr4 57030170 57030662 493 181 7 50.52 14.87 0.97 chr2 85878694 85879282 589 288 7 54.39 16.76 0.86 chr8 57064347 57064991 645 474 10 80.87 22.7 0.83 chr18 60283518 60284080 563 167 10 51.62 9.91 0.98 chr14 19009999 19010720 722 547 118 90.28 6.43 1 chr4 123592232 123592745 514 168 8 59.23 15.08 0.79 chr17 6492901 6493361 461 211 7 50.24 10.59 NA chr1 201432452 201432945 494 338 8 60.21 12.42 0.74 chr5 25353549 25354499 951 468 23 91.86 10.49 NA chr21 41376711 41377162 452 162 8 60.21 13.63 0.74 chr10 135193549 135194110 562 159 8 51.28 8.74 0.98 chr2 165659676 165660060 385 192 10 106.7 36.32 1.94 chr1 18733754 18734314 561 405 7 50.52 14.1 0.97 chr3 181501405 181501917 513 333 11 91.78 18.05 1.01 chr15 44748091 44748538 448 246 9 74.71 17.6 0.72 chr17 64850164 64850608 445 267 8 50.18 11.73 0.97 chr3 50293565 50294504 940 683 16 95.98 16.76 1.24 chr1 6780373 6780997 625 429 11 70.72 16.09 0.68 chr11 60722310 60723055 746 403 14 55.49 6.89 NA chr16 47690030 47690397 368 185 7 84.74 28.08 NA chr3 42881399 42881811 413 207 8 64.41 18.38 NA chr17 73738304 73739104 801 267 11 54.12 13.13 NA chr7 72886499 72887134 636 426 14 55.22 8.77 NA chr19 479390 480356 967 608 15 62.85 9.19 NA chr1 223554052 223554581 530 360 7 50.24 12.18 NA chr19 55642648 55643120 473 316 7 55.56 11.17 0.86 chr2 160150507 160150819 313 156 10 55.46 7.87 0.85 chr6 108379579 108380048 470 218 12 135.66 40.11 NA chr7 66659861 66660350 490 334 7 60.23 20.11 0.79 chr21 45158341 45159044 704 246 12 57.42 9.19 NA chr17 12142123 12142576 454 256 10 95.08 28.28 1.31 chr12 111535820 111536662 843 508 13 55.22 10.33 NA chr4 74053898 74054391 494 220 7 50.24 13.62 NA chr17 41107403 41107930 528 362 12 82.37 13.13 NA chr5 37022195 37022565 371 186 9 90.79 22.98 NA chr14 89037667 89038455 789 465 11 91.33 25.38 NA chr19 55988686 55989638 953 457 17 103.53 17.24 NA chrX 73187260 73187705 446 145 7 65.84 18.16 0.71 chr16 68160728 68161185 458 216 7 70.84 24.07 NA chr1 72383088 72383669 582 167 8 54.89 12.57 0.85 chr3 65290871 65291616 746 399 12 101.35 21.79 1.55 chr17 4396192 4396640 449 265 7 57.21 16.76 0.81 chr7 129241408 129242063 656 201 10 66.64 15.32 NA chr8 105375364 105376057 694 485 11 92.72 22.98 NA chr8 48921895 48922221 327 164 7 60.45 11.09 NA chr22 47339473 47339954 482 276 8 59.45 15.32 NA chr7 5545814 5546438 625 216 11 52.8 12.42 NA chr13 77761699 77762182 484 158 7 63.2 18.16 0.78 chr9 6582021 6582849 829 425 15 106.11 17.6 1.87 chr5 1430193 1430994 802 506 13 51.97 8.04 NA chr11 64532961 64533474 514 199 7 50.52 12.92 0.95 chr21 42055751 42056309 559 218 7 50.24 15.42 NA chr12 107211163 107211704 542 359 10 80.87 19.07 0.83 chr3 161238645 161239047 403 201 7 60.41 19.55 0.8 chr17 3680122 3680663 542 281 14 67.79 10.11 NA chr17 20455343 20455801 459 191 10 51.62 9.23 0.97 chr2 196378101 196378811 711 519 11 59.14 14.71 NA chr3 194245019 194245449 431 268 7 55.17 12.57 0.84 chr16 14626541 14627285 745 509 11 54.27 10.72 NA chr2 236773772 236774222 451 251 8 59.88 14.51 NA chr11 67551724 67552171 448 168 7 50.24 12.35 NA chr10 54421011 54421536 526 249 7 60.53 14.53 0.8 chr11 121028043 121028655 613 198 10 51.25 11.27 NA chr10 27600469 27601208 740 354 12 82.79 17.6 0.92 chr12 83275694 83276252 559 296 8 69.37 21.79 0.65 chr14 63851416 63851824 409 209 9 70.34 16.45 0.64 chr4 185478067 185478514 448 299 7 58.8 16.22 NA chr14 51818623 51819136 514 341 7 50.52 12.92 0.95 chr3 179394999 179395365 367 183 7 68.9 25.42 0.61 chr6 90496197 90496532 336 168 7 53.32 11.73 0.94 chr3 48664669 48665218 550 301 8 50.25 11.49 NA chr16 17028641 17029305 665 407 12 107.4 29.05 2.05 chr14 101874985 101876019 1035 571 13 74.1 13.79 NA chr12 42519772 42520228 457 260 7 53.33 15.08 0.96 chr9 135276277 135277082 806 298 12 73.73 16 0.72 chr14 104160631 104161383 753 182 11 60.5 11.6 0.8 chr3 85070344 85071044 701 413 8 59.88 19.33 NA chr11 107900936 107901419 484 287 7 51.54 12.57 0.97 chr12 130291640 130292042 403 201 7 50.3 14.18 0.97 chr21 41159981 41160669 689 191 9 59.49 18.79 100 chr6 118954241 118954912 672 336 10 80.87 23.65 0.83 chr10 70228566 70229168 603 226 10 72.72 17.6 0.69 chr22 46914023 46914630 608 424 9 51.27 11.17 0.98 chr14 61232829 61233495 667 481 10 80.46 21.46 NA chr4 74134806 74135475 670 418 9 58.57 13.79 NA chr10 7869332 7869993 662 176 10 78.36 15.71 0.76 chr5 120189408 120189950 543 325 8 56.98 16.09 NA chr1 11542794 11543378 585 312 8 59.88 16.13 NA chr1 9413700 9414272 573 159 8 55.43 12.57 0.85 chr12 131218246 131219043 798 602 12 58.77 10.72 NA chr1 29032216 29033010 795 333 11 59.42 14.71 NA chr2 187827920 187828577 658 291 11 78.27 20.68 NA chr11 28235164 28235576 413 214 10 71.56 15.08 0.64 chr4 173579304 173579706 403 201 9 78.66 22.62 0.77 chr12 80194469 80194868 400 200 8 81.06 29.05 0.84 chr11 26134912 26135581 670 472 12 103.04 23.58 1.57 chr9 88771969 88772469 501 371 8 50.17 13.79 NA chr8 146115421 146115783 363 182 7 65.63 22.99 NA chr6 140093330 140093770 441 169 7 54.45 15.08 0.86 ================================================ FILE: inst/test-plot/test-plotPeakProf.R ================================================ library(ChIPseeker) library(TxDb.Hsapiens.UCSC.hg19.knownGene) context("test plotPeakProf() for a list of windows") peak <- getSampleFiles()[[4]] peak_list <- getSampleFiles()[4:5] txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene ## self-made enhancer region in the form of granges object enhancer <- transcripts(txdb)[1:5000,] ## self-made non-enhancer region in the form of granges object non_enhancer <- unlist(fiveUTRsByTranscript(txdb))[1:5000] gr <- list(enhancer,non_enhancer) test_that("input two self-made granges object",{ p <- plotPeakProf(peak = peak, conf = 0.95, by = c("enhancer","non-enhancer"), windows_name = c("enhancer","non-enhancer"), weightCol = "V5", type = "start_site", upstream = 1000, downstream = 1000, TxDb = list(enhancer,non_enhancer)) expect_is(p,"gg") }) test_that("input a list of peaks",{ p <- plotPeakProf(peak = peak_list, TxDb = list(enhancer,non_enhancer), conf = 0.95, by = c("enhancer","non-enhancer"), windows_name = c("enhancer","non-enhancer"), weightCol = "V5", type = "start_site", upstream = 1000, downstream = 1000) expect_is(p,"gg") }) test_that("input gr and txdb input",{ p <- plotPeakProf(peak = peak, TxDb = list(enhancer,txdb), conf = 0.95, by = c("enhancer","gene"), windows_name = c("enhancer","gene"), weightCol = "V5", type = "start_site", upstream = 1000, downstream = 1000) expect_is(p,"gg") }) test_that("check body region",{ p <- plotPeakProf(peak = peak, TxDb = list(enhancer,txdb), conf = 0.95, by = c("enhancer","gene"), windows_name = c("enhancer","gene"), weightCol = "V5", type = "body", upstream = 1000, downstream = 1000, nbin = 800) expect_is(p,"gg") p <- plotPeakProf(peak = peak, TxDb = list(enhancer,txdb), conf = 0.95, by = c("enhancer","gene"), windows_name = c("enhancer","gene"), weightCol = "V5", type = "body", nbin = 800) expect_is(p,"gg") p <- plotPeakProf(peak = peak, TxDb = list(enhancer,txdb), conf = 0.95, by = c("enhancer","gene"), windows_name = c("enhancer","gene"), weightCol = "V5", type = "body", upstream = rel(0.2), downstream = rel(0.2), nbin = 800) expect_is(p,"gg") p <- plotPeakProf(peak = peak_list, TxDb = list(enhancer,txdb), conf = 0.95, by = c("enhancer","gene"), windows_name = c("enhancer","gene"), weightCol = "V5", type = "body", upstream = rel(0.2), downstream = rel(0.2), nbin = 800) expect_is(p,"gg") }) ================================================ FILE: inst/test-plot/test-plotTagMatrix.R ================================================ library(ChIPseeker) library(TxDb.Hsapiens.UCSC.hg19.knownGene) context("test plotTagMatrix() and related functions") test_that("test plotPeakProf2 function use txdb",{ peak <- getSampleFiles()[[4]] peak_list <- getSampleFiles()[4:5] txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene # test single peak file p1_1 <- plotPeakProf(peak = peak, upstream = 1000, downstream = 1000, by = "gene", type = "start_site", TxDb = txdb, nbin = 800) expect_is(p1_1, "gg") # test a list of peak files p1_2 <- plotPeakProf(peak = peak_list, upstream = 1000, downstream = 1000, by = "gene", type = "start_site", TxDb = txdb, nbin = 800) expect_is(p1_2, "gg") # test body region # without extension p2_1 <- plotPeakProf(peak = peak_list, by = "gene", type = "body", TxDb = txdb, nbin = 800) expect_is(p2_1, "gg") # extend with rel object p2_2 <- plotPeakProf(peak = peak_list, by = "gene", type = "body", TxDb = txdb, upstream = rel(0.2), downstream = rel(0.2), nbin = 800) expect_is(p2_2, "gg") # extend with actual number p2_3 <- plotPeakProf(peak = peak_list, by = "gene", type = "body", TxDb = txdb, upstream = 1000, downstream = 1000, nbin = 800) expect_is(p2_3, "gg") }) test_that("test plotPeakProf2 function use self-made granges",{ peak <- getSampleFiles()[[4]] peak_list <- getSampleFiles()[4:5] txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene # we consider transcript region as enhancer region # and make self-made granges object # they can be the same in the form of granges object txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene enhancer <- transcripts(txdb)[1:5000,] # test single peak file p1_1 <- plotPeakProf(peak = peak, upstream = 1000, downstream = 1000, by = "gene", type = "start_site", TxDb = enhancer, nbin = 800) expect_is(p1_1, "gg") # test a list of peak files p1_2 <- plotPeakProf(peak = peak_list, upstream = 1000, downstream = 1000, by = "gene", type = "start_site", TxDb = enhancer, nbin = 800) expect_is(p1_2, "gg") # test body region # without extension p2_1 <- plotPeakProf(peak = peak_list, by = "gene", type = "body", TxDb = enhancer, nbin = 800) expect_is(p2_1, "gg") # extend with rel object p2_2 <- plotPeakProf(peak = peak_list, by = "gene", type = "body", TxDb = enhancer, upstream = rel(0.2), downstream = rel(0.2), nbin = 800) expect_is(p2_2, "gg") # extend with actual number p2_3 <- plotPeakProf(peak = peak_list, by = "gene", type = "body", TxDb = enhancer, upstream = 1000, downstream = 1000, nbin = 800) expect_is(p2_3, "gg") }) ================================================ FILE: man/ChIPseeker-package.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/ChIPseeker-package.R \docType{package} \name{ChIPseeker-package} \alias{ChIPseeker} \alias{ChIPseeker-package} \title{ChIPseeker: ChIPseeker for ChIP peak Annotation, Comparison, and Visualization} \description{ This package implements functions to retrieve the nearest genes around the peak, annotate genomic region of the peak, statstical methods for estimate the significance of overlap among ChIP peak data sets, and incorporate GEO database for user to compare the own dataset with those deposited in database. The comparison can be used to infer cooperative regulation and thus can be used to generate hypotheses. Several visualization functions are implemented to summarize the coverage of the peak experiment, average profile and heatmap of peaks binding to TSS regions, genomic annotation, distance to TSS, and overlap of peaks or genes. } \seealso{ Useful links: \itemize{ \item \url{https://yulab-smu.top/contribution-knowledge-mining/} \item Report bugs at \url{https://github.com/YuLab-SMU/ChIPseeker/issues} } } \author{ \strong{Maintainer}: Guangchuang Yu \email{guangchuangyu@gmail.com} (\href{https://orcid.org/0000-0002-6485-8781}{ORCID}) Other contributors: \itemize{ \item Ming Li \email{limiang929@gmail.com} [contributor] \item Qianwen Wang \email{treywea@gmail.com} [contributor] \item Yun Yan \email{youryanyun@gmail.com} [contributor] \item Hervé Pagès \email{hpages.on.github@gmail.com} [contributor] \item Michael Kluge \email{michael.kluge@bio.ifi.lmu.de} [contributor] \item Thomas Schwarzl \email{schwarzl@embl.de} [contributor] \item Zhougeng Xu \email{xuzhougeng@163.com} [contributor] \item Chun-Hui Gao \email{gaospecial@gmail.com} (\href{https://orcid.org/0000-0002-1445-7939}{ORCID}) [contributor] } } \keyword{internal} ================================================ FILE: man/ChIPseekerCache.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/ChIPseeker-package.R \docType{data} \name{ChIPseekerCache} \alias{ChIPseekerCache} \title{Name of the ChIPseeker cache environment (internal static variable)} \format{ character vector } \usage{ ChIPseekerCache } \description{ Name of the ChIPseeker cache environment (internal static variable) } \keyword{datasets} ================================================ FILE: man/annotatePeak.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/annotatePeak.R \name{annotatePeak} \alias{annotatePeak} \title{annotatePeak} \usage{ annotatePeak( peak, tssRegion = c(-3000, 3000), TxDb = NULL, level = "transcript", assignGenomicAnnotation = TRUE, genomicAnnotationPriority = c("Promoter", "5UTR", "3UTR", "Exon", "Intron", "Downstream", "Intergenic"), annoDb = NULL, addFlankGeneInfo = FALSE, flankDistance = 5000, sameStrand = FALSE, ignoreOverlap = FALSE, ignoreUpstream = FALSE, ignoreDownstream = FALSE, overlap = "TSS", verbose = TRUE, columns = c("ENTREZID", "ENSEMBL", "SYMBOL", "GENENAME") ) } \arguments{ \item{peak}{peak file or GRanges object} \item{tssRegion}{Region Range of TSS} \item{TxDb}{TxDb or EnsDb annotation object} \item{level}{one of transcript and gene} \item{assignGenomicAnnotation}{logical, assign peak genomic annotation or not} \item{genomicAnnotationPriority}{genomic annotation priority} \item{annoDb}{annotation package} \item{addFlankGeneInfo}{logical, add flanking gene information from the peaks} \item{flankDistance}{distance of flanking sequence} \item{sameStrand}{logical, whether find nearest/overlap gene in the same strand} \item{ignoreOverlap}{logical, whether ignore overlap of TSS with peak} \item{ignoreUpstream}{logical, if True only annotate gene at the 3' of the peak.} \item{ignoreDownstream}{logical, if True only annotate gene at the 5' of the peak.} \item{overlap}{one of 'TSS' or 'all', if overlap="all", then gene overlap with peak will be reported as nearest gene, no matter the overlap is at TSS region or not.} \item{verbose}{print message or not} \item{columns}{names of columns to be obtained from database} } \value{ data.frame or GRanges object with columns of: all columns provided by input. annotation: genomic feature of the peak, for instance if the peak is located in 5'UTR, it will annotated by 5'UTR. Possible annotation is Promoter-TSS, Exon, 5' UTR, 3' UTR, Intron, and Intergenic. geneChr: Chromosome of the nearest gene geneStart: gene start geneEnd: gene end geneLength: gene length geneStrand: gene strand geneId: entrezgene ID distanceToTSS: distance from peak to gene TSS if annoDb is provided, extra column will be included: ENSEMBL: ensembl ID of the nearest gene SYMBOL: gene symbol GENENAME: full gene name } \description{ Annotate peaks } \examples{ \dontrun{ require(TxDb.Hsapiens.UCSC.hg19.knownGene) txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene peakfile <- system.file("extdata", "sample_peaks.txt", package="ChIPseeker") peakAnno <- annotatePeak(peakfile, tssRegion=c(-3000, 3000), TxDb=txdb) peakAnno } } \seealso{ \code{\link{plotAnnoBar}} \code{\link{plotAnnoPie}} \code{\link{plotDistToTSS}} } \author{ G Yu } ================================================ FILE: man/as.GRanges.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/csAnno.R \name{as.GRanges} \alias{as.GRanges} \title{as.GRanges} \usage{ as.GRanges(x) } \arguments{ \item{x}{csAnno object} } \value{ GRanges object } \description{ convert csAnno object to GRanges } \author{ Guangchuang Yu \url{https://guangchuangyu.github.io} } ================================================ FILE: man/as.data.frame.csAnno.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/csAnno.R \name{as.data.frame.csAnno} \alias{as.data.frame.csAnno} \title{as.data.frame.csAnno} \usage{ \method{as.data.frame}{csAnno}(x, row.names = NULL, optional = FALSE, ...) } \arguments{ \item{x}{csAnno object} \item{row.names}{row names} \item{optional}{should be omitted.} \item{...}{additional parameters} } \value{ data.frame } \description{ convert csAnno object to data.frame } \author{ Guangchuang Yu \url{https://guangchuangyu.github.io} } ================================================ FILE: man/check_upstream_and_downstream.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/utilities.R \name{check_upstream_and_downstream} \alias{check_upstream_and_downstream} \title{check upstream and downstream parameter} \usage{ check_upstream_and_downstream(upstream, downstream) } \arguments{ \item{upstream}{upstream} \item{downstream}{downstream} } \description{ check_upstream_and_downstream } ================================================ FILE: man/combine_csAnno.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/csAnno.R \name{combine_csAnno} \alias{combine_csAnno} \title{combine_csAnno} \usage{ combine_csAnno(x, ...) } \arguments{ \item{x}{csAnno object} \item{...}{csAnno objects} } \value{ csAnno object } \description{ Combine csAnno Object } \details{ https://github.com/YuLab-SMU/ChIPseeker/issues/157 } ================================================ FILE: man/covplot.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/covplot.R \name{covplot} \alias{covplot} \title{covplot} \usage{ covplot( peak, weightCol = NULL, xlab = "Chromosome Size (bp)", ylab = "", title = "ChIP Peaks over Chromosomes", chrs = NULL, xlim = NULL, lower = 1, fill_color = "black" ) } \arguments{ \item{peak}{peak file or GRanges object} \item{weightCol}{weight column of peak} \item{xlab}{xlab} \item{ylab}{ylab} \item{title}{title} \item{chrs}{selected chromosomes to plot, all chromosomes by default} \item{xlim}{ranges to plot, default is whole chromosome} \item{lower}{lower cutoff of coverage signal} \item{fill_color}{specify the color/palette for the plot. Order matters} } \value{ ggplot2 object } \description{ plot peak coverage } \author{ G Yu } ================================================ FILE: man/csAnno-class.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/csAnno.R \docType{class} \name{csAnno-class} \alias{csAnno-class} \alias{show,csAnno-method} \alias{vennpie,csAnno-method} \alias{plotDistToTSS,csAnno-method} \alias{plotAnnoBar,csAnno-method} \alias{plotAnnoPie,csAnno-method} \alias{upsetplot,csAnno-method} \alias{subset,csAnno-method} \title{Class "csAnno" This class represents the output of ChIPseeker Annotation} \description{ Class "csAnno" This class represents the output of ChIPseeker Annotation } \section{Slots}{ \describe{ \item{\code{anno}}{annotation} \item{\code{tssRegion}}{TSS region} \item{\code{level}}{transcript or gene} \item{\code{hasGenomicAnnotation}}{logical} \item{\code{detailGenomicAnnotation}}{Genomic Annotation in detail} \item{\code{annoStat}}{annotation statistics} \item{\code{peakNum}}{number of peaks} }} \seealso{ \code{\link{annotatePeak}} } \author{ Guangchuang Yu \url{https://guangchuangyu.github.io} } \keyword{classes} ================================================ FILE: man/dot-ChIPseekerEnv.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/utilities.R \name{.ChIPseekerEnv} \alias{.ChIPseekerEnv} \title{env function for ChIPseeker} \usage{ .ChIPseekerEnv(TxDb, item = "ChIPseekerEnv", force = FALSE) } \arguments{ \item{TxDb}{txdb object} \item{item}{item name} \item{force}{force to update txdb item in cache or not.} } \description{ env function for ChIPseeker } ================================================ FILE: man/dotFun.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/utilities.R \name{.} \alias{.} \title{.} \usage{ .(..., .env = parent.frame()) } \arguments{ \item{...}{expression} \item{.env}{environment} } \value{ expression } \description{ capture name of variable } \examples{ x <- 1 eval(.(x)[[1]]) } ================================================ FILE: man/downloadGEObedFiles.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/GEO.R \name{downloadGEObedFiles} \alias{downloadGEObedFiles} \title{downloadGEObedFiles} \usage{ downloadGEObedFiles(genome, destDir = getwd()) } \arguments{ \item{genome}{genome version} \item{destDir}{destination folder} } \description{ download all BED files of a particular genome version } \author{ G Yu } ================================================ FILE: man/downloadGSMbedFiles.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/GEO.R \name{downloadGSMbedFiles} \alias{downloadGSMbedFiles} \title{downloadGSMbedFiles} \usage{ downloadGSMbedFiles(GSM, destDir = getwd()) } \arguments{ \item{GSM}{GSM accession numbers} \item{destDir}{destination folder} } \description{ download BED supplementary files of a list of GSM accession numbers } \author{ G Yu } ================================================ FILE: man/dropAnno.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/annotatePeak.R \name{dropAnno} \alias{dropAnno} \title{dropAnno} \usage{ dropAnno(csAnno, distanceToTSS_cutoff = 10000) } \arguments{ \item{csAnno}{output of annotatePeak} \item{distanceToTSS_cutoff}{distance to TSS cutoff} } \value{ csAnno object } \description{ dropAnno } \details{ drop annotation exceeding distanceToTSS_cutoff } \author{ Guangchuang Yu } ================================================ FILE: man/enrichAnnoOverlap.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/enrichOverlap.R \name{enrichAnnoOverlap} \alias{enrichAnnoOverlap} \title{enrichAnnoOverlap} \usage{ enrichAnnoOverlap( queryPeak, targetPeak, TxDb = NULL, pAdjustMethod = "BH", chainFile = NULL, distanceToTSS_cutoff = NULL ) } \arguments{ \item{queryPeak}{query bed file} \item{targetPeak}{target bed file(s) or folder containing bed files} \item{TxDb}{TxDb} \item{pAdjustMethod}{pvalue adjustment method} \item{chainFile}{chain file for liftOver} \item{distanceToTSS_cutoff}{restrict nearest gene annotation by distance cutoff} } \value{ data.frame } \description{ calcuate overlap significant of ChIP experiments based on their nearest gene annotation } \author{ G Yu } ================================================ FILE: man/enrichPeakOverlap.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/enrichOverlap.R \name{enrichPeakOverlap} \alias{enrichPeakOverlap} \title{enrichPeakOverlap} \usage{ enrichPeakOverlap( queryPeak, targetPeak, TxDb = NULL, pAdjustMethod = "BH", nShuffle = 1000, chainFile = NULL, pool = TRUE, mc.cores = detectCores() - 1, verbose = TRUE ) } \arguments{ \item{queryPeak}{query bed file or GRanges object} \item{targetPeak}{target bed file(s) or folder that containing bed files or a list of GRanges objects} \item{TxDb}{TxDb} \item{pAdjustMethod}{pvalue adjustment method} \item{nShuffle}{shuffle numbers} \item{chainFile}{chain file for liftOver} \item{pool}{logical, whether pool target peaks} \item{mc.cores}{number of cores, see \link[parallel]{mclapply}} \item{verbose}{logical} } \value{ data.frame } \description{ calculate overlap significant of ChIP experiments based on the genome coordinations } \author{ G Yu } ================================================ FILE: man/getAnnoStat.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/csAnno.R \name{getAnnoStat} \alias{getAnnoStat} \title{getAnnoStat} \usage{ getAnnoStat(x) } \arguments{ \item{x}{csAnno object} } \description{ getting status of annotation } ================================================ FILE: man/getBioRegion.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/tagMatrix.R \name{getBioRegion} \alias{getBioRegion} \title{getBioRegion} \usage{ getBioRegion( TxDb = NULL, upstream = 1000, downstream = 1000, by = "gene", type = "start_site" ) } \arguments{ \item{TxDb}{TxDb} \item{upstream}{upstream from start site or end site} \item{downstream}{downstream from start site or end site} \item{by}{one of 'gene', 'transcript', 'exon', 'intron' , '3UTR' , '5UTR', 'UTR'} \item{type}{one of "start_site", "end_site", "body"} } \value{ GRanges object } \description{ prepare a bioregion of selected feature } \details{ this function combined previous functions getPromoters(), getBioRegion() and getGeneBody() in order to solve the following issues. (1) \url{https://github.com/GuangchuangYu/ChIPseeker/issues/16} (2) \url{https://github.com/GuangchuangYu/ChIPseeker/issues/87} The getBioRegion() function can prevoid a region of interest from \code{txdb} object. There are three kinds of regions, \code{start_site}, \code{end_site} and \code{body}. We take transcript region to expain the differences of these three regions. tx: chr1 1000 1400. \code{body} region refers to the 1000-1400bp. \code{start_site} region with \code{upstream = 100, downstream = 100} refers to 900-1100bp. \code{end_site} region with \code{upstream = 100, downstream = 100} refers to 1300-1500bp. } \author{ Guangchuang Yu, Ming L } ================================================ FILE: man/getGEOInfo.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/GEO.R \name{getGEOInfo} \alias{getGEOInfo} \title{getGEOInfo} \usage{ getGEOInfo(genome, simplify = TRUE) } \arguments{ \item{genome}{genome version} \item{simplify}{simplify result or not} } \value{ data.frame } \description{ get subset of GEO information by genome version keyword } \author{ G Yu } ================================================ FILE: man/getGEOgenomeVersion.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/GEO.R \name{getGEOgenomeVersion} \alias{getGEOgenomeVersion} \title{getGEOgenomeVersion} \usage{ getGEOgenomeVersion() } \value{ data.frame } \description{ get genome version statistics collecting from GEO ChIPseq data } \author{ G Yu } ================================================ FILE: man/getGEOspecies.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/GEO.R \name{getGEOspecies} \alias{getGEOspecies} \title{getGEOspecies} \usage{ getGEOspecies() } \value{ data.frame } \description{ accessing species statistics collecting from GEO database } \author{ G Yu } ================================================ FILE: man/getGeneAnno.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/addGeneAnno.R \name{getGeneAnno} \alias{getGeneAnno} \title{getGeneAnno} \usage{ getGeneAnno(annoDb, geneID, type, columns) } \arguments{ \item{annoDb}{annotation package} \item{geneID}{query geneID} \item{type}{gene ID type} \item{columns}{names of columns to be obtained from database} } \value{ data.frame } \description{ get gene annotation, symbol, gene name etc. } \author{ G Yu } ================================================ FILE: man/getGenomicAnnotation.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/getGenomicAnnotation.R \name{getGenomicAnnotation} \alias{getGenomicAnnotation} \title{getGenomicAnnotation} \usage{ getGenomicAnnotation( peaks, distance, tssRegion = c(-3000, 3000), TxDb, level, genomicAnnotationPriority, sameStrand = FALSE ) } \arguments{ \item{peaks}{peaks in GRanges object} \item{distance}{distance of peak to TSS} \item{tssRegion}{tssRegion, default is -3kb to +3kb} \item{TxDb}{TxDb object} \item{level}{one of gene or transcript} \item{genomicAnnotationPriority}{genomic Annotation Priority} \item{sameStrand}{whether annotate gene in same strand} } \value{ character vector } \description{ get Genomic Annotation of peaks } \author{ G Yu } ================================================ FILE: man/getNearestFeatureIndicesAndDistances.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/getNearestFeatureIndicesAndDistances.R \name{getNearestFeatureIndicesAndDistances} \alias{getNearestFeatureIndicesAndDistances} \title{getNearestFeatureIndicesAndDistances} \usage{ getNearestFeatureIndicesAndDistances( peaks, features, sameStrand = FALSE, ignoreOverlap = FALSE, ignoreUpstream = FALSE, ignoreDownstream = FALSE, overlap = "TSS" ) } \arguments{ \item{peaks}{peak in GRanges} \item{features}{features in GRanges} \item{sameStrand}{logical, whether find nearest gene in the same strand} \item{ignoreOverlap}{logical, whether ignore overlap of TSS with peak} \item{ignoreUpstream}{logical, if True only annotate gene at the 3' of the peak.} \item{ignoreDownstream}{logical, if True only annotate gene at the 5' of the peak.} \item{overlap}{one of "TSS" or "all"} } \value{ list } \description{ get index of features that closest to peak and calculate distance } \author{ G Yu } ================================================ FILE: man/getPromoters.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/tagMatrix.R \name{getPromoters} \alias{getPromoters} \title{getPromoters} \usage{ getPromoters(TxDb = NULL, upstream = 1000, downstream = 1000, by = "gene") } \arguments{ \item{TxDb}{TxDb} \item{upstream}{upstream from TSS site} \item{downstream}{downstream from TSS site} \item{by}{one of gene or transcript} } \value{ GRanges object } \description{ prepare the promoter regions } ================================================ FILE: man/getSampleFiles.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/utilities.R \name{getSampleFiles} \alias{getSampleFiles} \title{getSampleFiles} \usage{ getSampleFiles() } \value{ list of file names } \description{ get filenames of sample files } \author{ G Yu } ================================================ FILE: man/getTagMatrix.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/tagMatrix.R \name{getTagMatrix} \alias{getTagMatrix} \title{getTagMatrix} \usage{ getTagMatrix( peak, upstream, downstream, windows, type, by, TxDb = NULL, weightCol = NULL, nbin = NULL, verbose = TRUE, ignore_strand = FALSE ) } \arguments{ \item{peak}{peak peak file or GRanges object} \item{upstream}{the distance of upstream extension} \item{downstream}{the distance of downstream extension} \item{windows}{a collection of region} \item{type}{one of "start_site", "end_site", "body"} \item{by}{one of 'gene', 'transcript', 'exon', 'intron', '3UTR' , '5UTR', or specified by users} \item{TxDb}{TxDb or self-made granges object, served as txdb} \item{weightCol}{column name of weight, default is NULL} \item{nbin}{the amount of nbines} \item{verbose}{print message or not} \item{ignore_strand}{ignore the strand information or not} } \value{ tagMatrix } \description{ calculate the tag matrix } \details{ \code{getTagMatrix()} function can produce the matrix for visualization. \code{peak} stands for the peak file. \code{window} stands for a collection of regions that users want to look into. Users can use \code{window} to capture the peak of interest. There are two ways to input \code{window}. The first way is that users can use \code{getPromoters()/getBioRegion()/makeBioRegionFromGranges()} to get \code{window} and put it into \code{getTagMatrix()}. The second way is that users can use \code{getTagMatrix()} to call \code{getPromoters()/getBioRegion()/makeBioRegionFromGranges()}. In this way users do not need to input \code{window} parameter but they need to input \code{txdb}. \code{txdb} is a set of packages contained annotation of regions of different genomes. Users can get the regions of interest through specific functions. These specific functions are built in \code{getPromoters()/getBioRegion()}. Many regions can not be gain through \code{txdb}, like insulator and enhancer regions. Users can provide these regions in the form of granges object. These self-made granges object will be passed to \code{TxDb} parameter and they will be passed to \code{makeBioRegionFromGranges()} to produce the \code{window}. In a word, \code{TxDb} parameter is a reference information. Users can pass \code{txdb object} or self-made granges into it. Details see \code{\link{getPromoters}},\code{\link{getBioRegion}} and \code{\link{makeBioRegionFromGranges}} \code{upstream} and \code{downstream} parameter have different usages: (1) \code{window} parameter is provided, if \code{type == 'body'}, \code{upstream} and \code{downstream} can use to extend the flank of body region. if \code{type == 'start_site'/'end_site'}, \code{upstream} and \code{downstream} do not play a role in \code{getTagMatrix()} function. (2) \code{window} parameter is missing, if \code{type == 'body'}, \code{upstream} and \code{downstream} can use to extend the flank of body region. if \code{type == 'start_site'/'end_site'}, \code{upstream} and \code{downstream} refer to the upstream and downstream of the start_site or the end_site. \code{weightCol} refers to column in peak file. This column acts as a weight vaule. Details see \url{https://github.com/YuLab-SMU/ChIPseeker/issues/15} \code{nbin} refers to the number of bins. \code{getTagMatrix()} provide a binning method to get the tag matrix. } ================================================ FILE: man/getTagMatrix.binning.internal.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/tagMatrix.R \name{getTagMatrix.binning.internal} \alias{getTagMatrix.binning.internal} \title{getTagMatrix.binning.internal} \usage{ getTagMatrix.binning.internal( peak, weightCol = NULL, windows, nbin = 800, upstream = NULL, downstream = NULL, ignore_strand = FALSE ) } \arguments{ \item{peak}{peak peak file or GRanges object} \item{weightCol}{weightCol column name of weight, default is NULL} \item{windows}{windows a collection of region with equal or not equal size, eg. promoter region, gene region.} \item{nbin}{the amount of nbines needed to be splited and it should not be more than min_body_length} \item{upstream}{rel object, NULL or actual number} \item{downstream}{rel object, NULL or actual number} \item{ignore_strand}{ignore the strand information or not} } \value{ tagMatrix } \description{ calculate the tagMatrix by binning the idea was derived from the function of deeptools https://deeptools.readthedocs.io/en/develop/content/tools/computeMatrix.html } ================================================ FILE: man/getTagMatrix.internal.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/tagMatrix.R \name{getTagMatrix.internal} \alias{getTagMatrix.internal} \title{getTagMatrix.internal} \usage{ getTagMatrix.internal(peak, weightCol = NULL, windows, ignore_strand = FALSE) } \arguments{ \item{peak}{peak file or GRanges object} \item{weightCol}{column name of weight, default is NULL} \item{windows}{a collection of region with equal size, eg. promoter region.} \item{ignore_strand}{ignore the strand information or not} } \value{ tagMatrix } \description{ calculate the tag matrix } \author{ G Yu } ================================================ FILE: man/getTagMatrix2.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/tagMatrix.R \name{getTagMatrix2} \alias{getTagMatrix2} \title{getTagMatrix2} \usage{ getTagMatrix2( peak, upstream, downstream, windows_name, type, by, TxDb = NULL, weightCol = NULL, nbin = NULL, verbose = TRUE, ignore_strand = FALSE ) } \arguments{ \item{peak}{peak peak file or GRanges object} \item{upstream}{the distance of upstream extension} \item{downstream}{the distance of downstream extension} \item{windows_name}{the names of windows} \item{type}{one of "start_site", "end_site", "body"} \item{by}{one of 'gene', 'transcript', 'exon', 'intron', '3UTR' , '5UTR', or specified by users} \item{TxDb}{TxDb or self-made granges object, served as txdb} \item{weightCol}{column name of weight, default is NULL} \item{nbin}{the amount of nbines} \item{verbose}{print message or not} \item{ignore_strand}{ignore the strand information or not} } \value{ tagMatrix } \description{ Nested function for getTagMatrix() to deal with multiple windows } \details{ This is an internal function. } ================================================ FILE: man/getTagMatrix2.binning.internal.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/tagMatrix.R \name{getTagMatrix2.binning.internal} \alias{getTagMatrix2.binning.internal} \title{internal function} \usage{ getTagMatrix2.binning.internal( peak, weightCol = NULL, windows, windows_name, nbin = 800, upstream = NULL, downstream = NULL, ignore_strand = FALSE ) } \arguments{ \item{peak}{peak peak file or GRanges object} \item{weightCol}{column name of weight, default is NULL} \item{windows}{a collection of region} \item{windows_name}{the name of windows} \item{nbin}{the amount of nbines} \item{upstream}{the distance of upstream extension} \item{downstream}{the distance of downstream extension} \item{ignore_strand}{ignore the strand information or not} } \description{ internal function } ================================================ FILE: man/getTagMatrix2.internal.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/tagMatrix.R \name{getTagMatrix2.internal} \alias{getTagMatrix2.internal} \title{getTagMatrix2.internal} \usage{ getTagMatrix2.internal( peak, weightCol = NULL, windows, windows_name, ignore_strand = FALSE ) } \arguments{ \item{peak}{peak peak file or GRanges object} \item{weightCol}{column name of weight, default is NULL} \item{windows}{a collection of region} \item{windows_name}{the name of windows} \item{ignore_strand}{ignore the strand information or not} } \description{ getTagMatrix2.internal } ================================================ FILE: man/info.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/ChIPseeker-package.R \docType{data} \name{info} \alias{info} \alias{ucsc_release} \alias{gsminfo} \alias{tagMatrixList} \title{Information Datasets} \description{ ucsc genome version, precalcuated data and gsm information } \keyword{datasets} ================================================ FILE: man/makeBioRegionFromGranges.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/tagMatrix.R \name{makeBioRegionFromGranges} \alias{makeBioRegionFromGranges} \title{makeBioRegionFromGranges} \usage{ makeBioRegionFromGranges(gr, by, type, upstream = 1000, downstream = 1000) } \arguments{ \item{gr}{a grange object contain region of interest} \item{by}{specify be users, e.g. gene, insulator, enhancer} \item{type}{one of "start_site", "end_site", "body"} \item{upstream}{upstream from start site or end site, can be NULL if the type == 'body'} \item{downstream}{downstream from start site or end site, can be NULL if the type == 'body'} } \value{ GRanges object } \description{ make windows from granges object } \details{ \code{makeBioRegionFromGranges()} function can make bioregion from granges object. The differences between \code{makeBioRegionFromGranges()} and \code{getBioRegion()} is that \code{getBioRegion()} get the region object from \code{txdb} object but \code{makeBioRegionFromGranges()} get the region from the granges object provided by users. For example, \code{txdb} object do not contain insulator or enhancer regions. Users can provide these regions through self-made granges object There are three kinds of regions, \code{start_site}, \code{end_site} and \code{body}. We take enhancer region to explain the differences of these three regions. enhancer: chr1 1000 1400. \code{body} region refers to the 1000-1400bp. \code{start_site} region with \code{upstream = 100, downstream = 100} refers to 900-1100bp. \code{end_site} region with \code{upstream = 100, downstream = 100} refers to 1300-1500bp. In \code{makeBioRegionFromGranges()}, \code{upstream} and \code{downstream} can be \code{NULL} if the \code{type == 'body'}. \code{by} should be specified by users and can not be omitted. \code{by} parameter will be used to made labels. \code{type} should also be specified. \url{https://github.com/YuLab-SMU/ChIPseeker/issues/189} } ================================================ FILE: man/make_label.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/utilities.R \name{make_label} \alias{make_label} \title{make label for figures} \usage{ make_label(type, by) } \arguments{ \item{type}{one of "start_site", "end_site", "body"} \item{by}{one of 'gene', 'transcript', 'exon', 'intron' , '3UTR' , '5UTR', 'UTR'} } \description{ make label for figures } ================================================ FILE: man/overlap.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/utilities.R \name{overlap} \alias{overlap} \title{overlap} \usage{ overlap(Sets) } \arguments{ \item{Sets}{a list of objects} } \value{ data.frame } \description{ calculate the overlap matrix, which is useful for vennplot } \author{ G Yu } ================================================ FILE: man/peakHeatmap.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/plotTagMatrix.R \name{peakHeatmap} \alias{peakHeatmap} \title{peakHeatmap} \usage{ peakHeatmap( peak, weightCol = NULL, TxDb = NULL, upstream = 1000, downstream = 1000, xlab = "", ylab = "", title = NULL, palette = NULL, verbose = TRUE, by = "gene", type = "start_site", nbin = NULL, ignore_strand = FALSE, windows, ncol = NULL, nrow = NULL ) } \arguments{ \item{peak}{peak file or GRanges object} \item{weightCol}{column name of weight} \item{TxDb}{TxDb object} \item{upstream}{upstream position} \item{downstream}{downstream position} \item{xlab}{xlab} \item{ylab}{ylab} \item{title}{title} \item{palette}{palette to be filled in,details see \link[ggplot2]{scale_colour_brewer}} \item{verbose}{print message or not} \item{by}{one of 'gene', 'transcript', 'exon', 'intron' , '3UTR' , '5UTR', 'UTR'} \item{type}{one of "start_site", "end_site", "body"} \item{nbin}{the amount of nbines} \item{ignore_strand}{ignore the strand information or not} \item{windows}{a collection of region} \item{ncol}{the ncol of plotting a list of peak} \item{nrow}{the nrow of plotting a list of peak} } \value{ figure } \description{ plot the heatmap of peaks } \author{ G Yu } ================================================ FILE: man/peakHeatmap_multiple_Sets.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/plotTagMatrix.R \name{peakHeatmap_multiple_Sets} \alias{peakHeatmap_multiple_Sets} \title{peakHeatmap} \usage{ peakHeatmap_multiple_Sets( peak, weightCol = NULL, TxDb = NULL, upstream = 1000, downstream = 1000, xlab = "", ylab = "", title = NULL, palette = NULL, verbose = TRUE, by = "gene", type = "start_site", nbin = NULL, ignore_strand = FALSE, windows_name = NULL, ncol = NULL, nrow = NULL, facet_label_text_size = 12 ) } \arguments{ \item{peak}{peak file or GRanges object} \item{weightCol}{column name of weight} \item{TxDb}{TxDb object} \item{upstream}{upstream position} \item{downstream}{downstream position} \item{xlab}{xlab} \item{ylab}{ylab} \item{title}{title} \item{palette}{palette to be filled in,details see \link[ggplot2]{scale_colour_brewer}} \item{verbose}{print message or not} \item{by}{one of 'gene', 'transcript', 'exon', 'intron' , '3UTR' , '5UTR', 'UTR'} \item{type}{one of "start_site", "end_site", "body"} \item{nbin}{the amount of nbines} \item{ignore_strand}{ignore the strand information or not} \item{windows_name}{the name for each window, which will also be showed in the picture as labels} \item{ncol}{the ncol of plotting a list of peak} \item{nrow}{the nrow of plotting a list of peak} \item{facet_label_text_size}{the size of facet label text} } \value{ figure } \description{ plot the heatmap of peaks align to a sets of regions } ================================================ FILE: man/peak_Profile_Heatmap.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/plotTagMatrix.R \name{peak_Profile_Heatmap} \alias{peak_Profile_Heatmap} \title{peak_Profile_Heatmap} \usage{ peak_Profile_Heatmap( peak, weightCol = NULL, TxDb = NULL, upstream = 1000, downstream = 1000, xlab = "", ylab = "", title = NULL, palette = NULL, verbose = TRUE, by = "gene", type = "start_site", nbin = NULL, ignore_strand = FALSE, windows_name = NULL, ncol = NULL, nrow = NULL, facet_label_text_size = 12, conf, facet = "row", free_y = TRUE, height_proportion = 4 ) } \arguments{ \item{peak}{peak file or GRanges object} \item{weightCol}{column name of weight} \item{TxDb}{TxDb object} \item{upstream}{upstream position} \item{downstream}{downstream position} \item{xlab}{xlab} \item{ylab}{ylab} \item{title}{title} \item{palette}{palette to be filled in,details see \link[ggplot2]{scale_colour_brewer}} \item{verbose}{print message or not} \item{by}{one of 'gene', 'transcript', 'exon', 'intron' , '3UTR' , '5UTR', 'UTR'} \item{type}{one of "start_site", "end_site", "body"} \item{nbin}{the amount of nbines} \item{ignore_strand}{ignore the strand information or not} \item{windows_name}{the name for each window, which will also be showed in the picture as labels} \item{ncol}{the ncol of plotting a list of peak} \item{nrow}{the nrow of plotting a list of peak} \item{facet_label_text_size}{the size of facet label text} \item{conf}{confidence interval} \item{facet}{one of 'none', 'row' and 'column'} \item{free_y}{if TRUE, y will be scaled by AvgProf} \item{height_proportion}{the proportion of profiling picture and heatmap} } \description{ plot peak heatmap and profile in a picture } ================================================ FILE: man/plotAnnoBar-methods.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/AllGenerics.R, R/csAnno.R \docType{methods} \name{plotAnnoBar} \alias{plotAnnoBar} \alias{plotAnnoBar,list-method} \alias{plotAnnoBar,csAnno,ANY-method} \title{plotAnnoBar method generics} \usage{ plotAnnoBar( x, xlab = "", ylab = "Percentage(\%)", title = "Feature Distribution", ... ) \S4method{plotAnnoBar}{list}( x, xlab = "", ylab = "Percentage(\%)", title = "Feature Distribution", ... ) plotAnnoBar(x, xlab="", ylab='Percentage(\%)',title="Feature Distribution", ...) } \arguments{ \item{x}{\code{csAnno} instance} \item{xlab}{xlab} \item{ylab}{ylab} \item{title}{title} \item{...}{additional paramter} } \value{ plot } \description{ plotAnnoBar method for \code{csAnno} instance } \author{ Guangchuang Yu \url{https://guangchuangyu.github.io} } ================================================ FILE: man/plotAnnoBar.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/plotAnno.R \name{plotAnnoBar.data.frame} \alias{plotAnnoBar.data.frame} \title{plotAnnoBar.data.frame} \usage{ plotAnnoBar.data.frame( anno.df, xlab = "", ylab = "Percentage(\%)", title = "Feature Distribution", categoryColumn ) } \arguments{ \item{anno.df}{annotation stats} \item{xlab}{xlab} \item{ylab}{ylab} \item{title}{plot title} \item{categoryColumn}{category column} } \value{ bar plot that summarize genomic features of peaks } \description{ plot feature distribution based on their chromosome region } \details{ plot chromosome region features } \seealso{ \code{\link{annotatePeak}} \code{\link{plotAnnoPie}} } \author{ Guangchuang Yu \url{https://yulab-smu.top} } ================================================ FILE: man/plotAnnoPie-methods.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/AllGenerics.R, R/csAnno.R \docType{methods} \name{plotAnnoPie} \alias{plotAnnoPie} \alias{plotAnnoPie,csAnno,ANY-method} \title{plotAnnoPie method generics} \usage{ plotAnnoPie( x, ndigit = 2, cex = 0.9, col = NA, legend.position = "rightside", pie3D = FALSE, radius = 0.8, ... ) plotAnnoPie(x,ndigit=2,cex=0.9,col=NA,legend.position="rightside",pie3D=FALSE,radius=0.8,...) } \arguments{ \item{x}{\code{csAnno} instance} \item{ndigit}{number of digit to round} \item{cex}{label cex} \item{col}{color} \item{legend.position}{topright or other.} \item{pie3D}{plot in 3D or not} \item{radius}{radius of the pie} \item{...}{extra parameter} } \value{ plot } \description{ plotAnnoPie method for \code{csAnno} instance } \author{ Guangchuang Yu \url{https://guangchuangyu.github.io} } ================================================ FILE: man/plotAnnoPie.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/plotAnno.R \name{plotAnnoPie.csAnno} \alias{plotAnnoPie.csAnno} \title{plotAnnoPie} \usage{ plotAnnoPie.csAnno( x, ndigit = 2, cex = 0.8, col = NA, legend.position = "rightside", pie3D = FALSE, radius = 0.8, ... ) } \arguments{ \item{x}{csAnno object} \item{ndigit}{number of digit to round} \item{cex}{label cex} \item{col}{color} \item{legend.position}{topright or other.} \item{pie3D}{plot in 3D or not} \item{radius}{radius of Pie} \item{...}{extra parameter} } \value{ pie plot of peak genomic feature annotation } \description{ pieplot from peak genomic annotation } \examples{ \dontrun{ require(TxDb.Hsapiens.UCSC.hg19.knownGene) txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene peakfile <- system.file("extdata", "sample_peaks.txt", package="chipseeker") peakAnno <- annotatePeak(peakfile, TxDb=txdb) plotAnnoPie(peakAnno) } } \seealso{ \code{\link{annotatePeak}} \code{\link{plotAnnoBar}} } \author{ Guangchuang Yu \url{https://yulab-smu.top} } ================================================ FILE: man/plotAvgProf.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/plotTagMatrix.R \name{plotAvgProf} \alias{plotAvgProf} \title{plotAvgProf} \usage{ plotAvgProf( tagMatrix, xlim, xlab = "Genomic Region (5'->3')", ylab = "Peak Count Frequency", conf, facet = "none", free_y = TRUE, origin_label = "TSS", verbose = TRUE, ... ) } \arguments{ \item{tagMatrix}{tagMatrix or a list of tagMatrix} \item{xlim}{xlim} \item{xlab}{x label} \item{ylab}{y label} \item{conf}{confidence interval} \item{facet}{one of 'none', 'row' and 'column'} \item{free_y}{if TRUE, y will be scaled by AvgProf} \item{origin_label}{label of the center} \item{verbose}{print message or not} \item{...}{additional parameter} } \value{ ggplot object } \description{ plot the profile of peaks } \author{ G Yu; Y Yan } ================================================ FILE: man/plotAvgProf.binning.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/plotTagMatrix.R \name{plotAvgProf.binning} \alias{plotAvgProf.binning} \title{plotAvgProf.binning} \usage{ plotAvgProf.binning( tagMatrix, xlab = "Genomic Region (5'->3')", ylab = "Peak Count Frequency", conf, facet = "none", free_y = TRUE, upstream = NULL, downstream = NULL, label, ... ) } \arguments{ \item{tagMatrix}{tagMatrix or a list of tagMatrix} \item{xlab}{x label} \item{ylab}{y label} \item{conf}{confidence interval} \item{facet}{one of 'none', 'row' and 'column'} \item{free_y}{if TRUE, y will be scaled} \item{upstream}{rel object reflects the percentage of flank extension, e.g rel(0.2) integer reflects the actual length of flank extension or TSS region NULL reflects the gene body with no extension} \item{downstream}{rel object reflects the percentage of flank extension, e.g rel(0.2) integer reflects the actual length of flank extension or TSS region NULL reflects the gene body with no extension} \item{label}{label} \item{...}{additional parameter} } \value{ ggplot object } \description{ plot the profile of peaks by binning } ================================================ FILE: man/plotAvgProf2.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/plotTagMatrix.R \name{plotAvgProf2} \alias{plotAvgProf2} \title{plotAvgProf} \usage{ plotAvgProf2( peak, weightCol = NULL, TxDb = NULL, upstream = 1000, downstream = 1000, xlab = "Genomic Region (5'->3')", ylab = "Peak Count Frequency", conf, facet = "none", free_y = TRUE, verbose = TRUE, ignore_strand = FALSE, ... ) } \arguments{ \item{peak}{peak file or GRanges object} \item{weightCol}{column name of weight} \item{TxDb}{TxDb object} \item{upstream}{upstream position} \item{downstream}{downstream position} \item{xlab}{xlab} \item{ylab}{ylab} \item{conf}{confidence interval} \item{facet}{one of 'none', 'row' and 'column'} \item{free_y}{if TRUE, y will be scaled by AvgProf} \item{verbose}{print message or not} \item{ignore_strand}{ignore the strand information or not} \item{...}{additional parameter} } \value{ ggplot object } \description{ plot the profile of peaks that align to flank sequences of TSS } \details{ This function is the old function of \code{plotPeakProf2}. It can only plot the start site region of gene. } \author{ G Yu, Ming L } ================================================ FILE: man/plotDistToTSS-methods.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/AllGenerics.R, R/csAnno.R \docType{methods} \name{plotDistToTSS} \alias{plotDistToTSS} \alias{plotDistToTSS,list-method} \alias{plotDistToTSS,csAnno,ANY-method} \title{plotDistToTSS method generics} \usage{ plotDistToTSS( x, distanceColumn = "distanceToTSS", xlab = "", ylab = "Binding sites (\%) (5'->3')", title = "Distribution of transcription factor-binding loci relative to TSS", ... ) \S4method{plotDistToTSS}{list}( x, distanceColumn = "distanceToTSS", xlab = "", ylab = "Binding sites (\%) (5'->3')", title = "Distribution of transcription factor-binding loci relative to TSS", distanceBreaks = c(0, 1000, 3000, 5000, 10000, 1e+05), palette = NULL, ... ) plotDistToTSS(x,distanceColumn="distanceToTSS", xlab="", ylab="Binding sites (\%) (5'->3')", title="Distribution of transcription factor-binding loci relative to TSS",...) } \arguments{ \item{x}{\code{csAnno} instance} \item{distanceColumn}{distance column name} \item{xlab}{xlab} \item{ylab}{ylab} \item{title}{title} \item{...}{additional parameter} \item{distanceBreaks}{breaks of distance, default is 'c(0, 1000, 3000, 5000, 10000, 100000)'} \item{palette}{palette name for coloring different distances. Run `RColorBrewer::display.brewer.all()` to see all applicable values.} } \value{ plot } \description{ plotDistToTSS method for \code{csAnno} instance } \author{ Guangchuang Yu \url{https://guangchuangyu.github.io} } ================================================ FILE: man/plotDistToTSS.data.frame.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/plotDistToTSS.R \name{plotDistToTSS.data.frame} \alias{plotDistToTSS.data.frame} \title{plotDistToTSS.data.frame} \usage{ plotDistToTSS.data.frame( peakDist, distanceColumn = "distanceToTSS", distanceBreaks = c(0, 1000, 3000, 5000, 10000, 1e+05), palette = NULL, xlab = "", ylab = "Binding sites (\%) (5'->3')", title = "Distribution of transcription factor-binding loci relative to TSS", categoryColumn = ".id" ) } \arguments{ \item{peakDist}{peak annotation} \item{distanceColumn}{column name of the distance from peak to nearest gene} \item{distanceBreaks}{default is 'c(0, 1000, 3000, 5000, 10000, 100000)'} \item{palette}{palette name for coloring different distances. Run `RColorBrewer::display.brewer.all()` to see all applicable values.} \item{xlab}{x label} \item{ylab}{y lable} \item{title}{figure title} \item{categoryColumn}{category column, default is ".id"} } \value{ bar plot that summarize distance from peak to TSS of the nearest gene. } \description{ plot feature distribution based on the distances to the TSS } \examples{ \dontrun{ require(TxDb.Hsapiens.UCSC.hg19.knownGene) txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene peakfile <- system.file("extdata", "sample_peaks.txt", package="ChIPseeker") peakAnno <- annotatePeak(peakfile, TxDb=txdb) plotDistToTSS(peakAnno) } } \seealso{ \code{\link{annotatePeak}} } \author{ Guangchuang Yu \url{https://guangchuangyu.github.io} } ================================================ FILE: man/plotMultiProf.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/plotTagMatrix.R \name{plotMultiProf} \alias{plotMultiProf} \title{internal function for plotPeakProf_MultiWindows} \usage{ plotMultiProf( tagMatrix, conf, xlab = "Genomic Region (5'->3')", ylab = "Peak Count Frequency", facet = "none", free_y = TRUE, ... ) } \arguments{ \item{tagMatrix}{tagMatrix} \item{conf}{confidence interval} \item{xlab}{xlab} \item{ylab}{ylab} \item{facet}{one of 'none', 'row' and 'column'} \item{free_y}{if TRUE, y will be scaled by AvgProf} \item{...}{additional parameter} } \description{ internal function for plotPeakProf_MultiWindows } ================================================ FILE: man/plotMultiProf.binning.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/plotTagMatrix.R \name{plotMultiProf.binning} \alias{plotMultiProf.binning} \title{internal function} \usage{ plotMultiProf.binning( tagMatrix, xlab = "Genomic Region (5'->3')", ylab = "Peak Count Frequency", conf, facet = "none", free_y = TRUE, upstream = NULL, downstream = NULL, label, ... ) } \arguments{ \item{tagMatrix}{tagMatrix} \item{xlab}{xlab} \item{ylab}{ylab} \item{conf}{confidence interval} \item{facet}{one of 'none', 'row' and 'column'} \item{free_y}{if TRUE, y will be scaled by AvgProf} \item{upstream}{the upstream extension} \item{downstream}{the downstream extension} \item{label}{the label of the center} \item{...}{additional parameter} } \description{ internal function } ================================================ FILE: man/plotMultiProf.binning.internal.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/plotTagMatrix.R \name{plotMultiProf.binning.internal} \alias{plotMultiProf.binning.internal} \title{internal function} \usage{ plotMultiProf.binning.internal( tagMatrix, conf, xlab = "Genomic Region (5'->3')", ylab = "Peak Count Frequency", facet = "none", free_y = TRUE, upstream = NULL, downstream = NULL, label, ... ) } \arguments{ \item{tagMatrix}{tagMatrix} \item{conf}{confidence interval} \item{xlab}{xlab} \item{ylab}{ylab} \item{facet}{one of 'none', 'row' and 'column'} \item{free_y}{if TRUE, y will be scaled by AvgProf} \item{upstream}{the upstream extension} \item{downstream}{the downstream extension} \item{label}{the label of the center} \item{...}{additional parameter} } \description{ internal function } ================================================ FILE: man/plotMultiProf.normal.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/plotTagMatrix.R \name{plotMultiProf.normal} \alias{plotMultiProf.normal} \title{internal function} \usage{ plotMultiProf.normal( tagMatrix, xlim, xlab = "Genomic Region (5'->3')", ylab = "Peak Count Frequency", conf, facet = "none", free_y = TRUE, origin_label = "TSS", verbose = TRUE, ... ) } \arguments{ \item{tagMatrix}{tagMatrix} \item{xlim}{xlim} \item{xlab}{xlab} \item{ylab}{ylab} \item{conf}{confidence interval} \item{facet}{one of 'none', 'row' and 'column'} \item{free_y}{if TRUE, y will be scaled by AvgProf} \item{origin_label}{the label of the center} \item{verbose}{print message or not} \item{...}{additional parameter} } \description{ internal function } ================================================ FILE: man/plotMultiProf.normal.internal.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/plotTagMatrix.R \name{plotMultiProf.normal.internal} \alias{plotMultiProf.normal.internal} \title{internal function} \usage{ plotMultiProf.normal.internal( tagMatrix, conf, xlim = c(-3000, 3000), xlab = "Genomic Region (5'->3')", ylab = "Peak Count Frequency", facet = "row", free_y = TRUE, origin_label, ... ) } \arguments{ \item{tagMatrix}{tagMatrix} \item{conf}{confidence interval} \item{xlim}{xlim} \item{xlab}{xlab} \item{ylab}{ylab} \item{facet}{one of 'none', 'row' and 'column'} \item{free_y}{if TRUE, y will be scaled by AvgProf} \item{origin_label}{the label of the center} \item{...}{additional parameter} } \description{ internal function } ================================================ FILE: man/plotPeakProf.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/plotTagMatrix.R \name{plotPeakProf} \alias{plotPeakProf} \title{plotPeakProf_MultiWindows} \usage{ plotPeakProf( tagMatrix = NULL, peak, upstream, downstream, conf, by, type, windows_name = NULL, weightCol = NULL, TxDb = NULL, xlab = "Genomic Region (5'->3')", ylab = "Peak Count Frequency", facet = "row", free_y = TRUE, verbose = TRUE, nbin = NULL, ignore_strand = FALSE, ... ) } \arguments{ \item{tagMatrix}{tagMatrix or a list of tagMatrix} \item{peak}{peak file or GRanges object} \item{upstream}{upstream position} \item{downstream}{downstream position} \item{conf}{confidence interval} \item{by}{feature of interest} \item{type}{one of "start_site", "end_site", "body"} \item{windows_name}{the name for each window, which will also be showed in the picture as labels} \item{weightCol}{column name of weight} \item{TxDb}{TxDb object or self-made granges objects} \item{xlab}{xlab} \item{ylab}{ylab} \item{facet}{one of 'none', 'row' and 'column'} \item{free_y}{if TRUE, y will be scaled by AvgProf} \item{verbose}{print message or not} \item{nbin}{the amount of bines} \item{ignore_strand}{ignore the strand information or not} \item{...}{additional parameter} } \value{ ggplot object } \description{ plot the profile of peaks ` \code{plotPeakProf_MultiWindows()} is almost the same as \code{plotPeakProf2()}, having the main difference of accepting two or more granges objects. Accepting more granges objects can help compare the same peaks in different windows. } \details{ \code{TxDb} parameter can accept txdb object. But many regions can not be obtained by txdb object. In this case, Users can provide self-made granges served the same role as txdb object and pass to \code{TxDb} object. \code{by} the features of interest. (1) if users use \code{txdb}, \code{by} can be one of 'gene', 'transcript', 'exon', 'intron' , '3UTR' , '5UTR', 'UTR'. These features can be obtained by functions from txdb object. (2) if users use self-made granges object, \code{by} can be everything. Because this \code{by} will not pass to functions to get features, which is different from the case of using txdb object. This \code{by} is only used to made labels showed in picture. \code{type} means the property of the region. one of the "start site", "end site" and "body". \code{upstream} and \code{downstream} parameter have different usages: (1) if \code{type == 'body'}, \code{upstream} and \code{downstream} can use to extend the flank of body region. (2) if \code{type == 'start_site'/'end_site'}, \code{upstream} and \code{downstream} refer to the upstream and downstream of the start_site or the end_site. \code{weightCol} refers to column in peak file. This column acts as a weight value. Details see \url{https://github.com/YuLab-SMU/ChIPseeker/issues/15} \code{nbin} refers to the number of bins. \code{getTagMatrix()} provide a binning method to get the tag matrix. There are two ways input a list of window. (1) Users can input a list of self-made granges objects (2) Users can input a list of \code{by} and only one \code{type}. In this way, \code{plotPeakProf_MultiWindows()} can made a list of window from txdb object based on \code{by} and \code{type}. Warning: (1) All of these window should be the same type. It means users can only compare a list of "start site"/"end site"/"body region" with the same upstream and downstream. (2) So it will be only one \code{type} and several \code{by}. (3) Users can make window by txdb object or self-made granges object. Users can only choose one of 'gene', 'transcript', 'exon', 'intron' , '3UTR' , '5UTR' or 'UTR' in the way of using txdb object. User can input any \code{by} in the way of using self-made granges object. (4) Users can mingle the \code{by} designed for the two ways. \code{plotPeakProf_MultiWindows} can accpet the hybrid \code{by}. But the above rules should be followed. \url{https://github.com/YuLab-SMU/ChIPseeker/issues/189} } ================================================ FILE: man/plotPeakProf2.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/plotTagMatrix.R \name{plotPeakProf2} \alias{plotPeakProf2} \title{plotPeakProf2} \usage{ plotPeakProf2( peak, upstream, downstream, conf, by, type, weightCol = NULL, TxDb = NULL, xlab = "Genomic Region (5'->3')", ylab = "Peak Count Frequency", facet = "none", free_y = TRUE, verbose = TRUE, nbin = NULL, ignore_strand = FALSE, ... ) } \arguments{ \item{peak}{peak file or GRanges object} \item{upstream}{upstream position} \item{downstream}{downstream position} \item{conf}{confidence interval} \item{by}{e.g. 'gene', 'transcript', 'exon' or features of interest(e.g. "enhancer")} \item{type}{one of "start_site", "end_site", "body"} \item{weightCol}{column name of weight} \item{TxDb}{TxDb object, or self-made granges object} \item{xlab}{xlab} \item{ylab}{ylab} \item{facet}{one of 'none', 'row' and 'column'} \item{free_y}{if TRUE, y will be scaled by AvgProf} \item{verbose}{print message or not} \item{nbin}{the amount of nbines} \item{ignore_strand}{ignore the strand information or not} \item{...}{additional parameter} } \value{ ggplot object } \description{ plot the profile of peaks automatically } \details{ \code{peak} stands for the peak file. \code{by} the features of interest. (1) if users use \code{txdb}, \code{by} can be one of 'gene', 'transcript', 'exon', 'intron' , '3UTR' , '5UTR', 'UTR'. These features can be obtained by functions from txdb object. (2) if users use self-made granges object, \code{by} can be everything. Because this \code{by} will not pass to functions to get features, which is different from the case of using txdb object. This \code{by} is only used to made labels showed in picture. \code{type} means the property of the region. one of the "start site", "end site" and "body". \code{upstream} and \code{downstream} parameter have different usages: (1) if \code{type == 'body'}, \code{upstream} and \code{downstream} can use to extend the flank of body region. (2) if \code{type == 'start_site'/'end_site'}, \code{upstream} and \code{downstream} refer to the upstream and downstream of the start_site or the end_site. \code{weightCol} refers to column in peak file. This column acts as a weight vaule. Details see \url{https://github.com/YuLab-SMU/ChIPseeker/issues/15} \code{nbin} refers to the number of bins, providing a binning method to get the tag matrix. \code{TxDb} parameter can accept txdb object. But many regions can not be obtained by txdb object. In this case, Users can provide self-made granges served the same role as txdb object and pass to \code{TxDb} object. \code{plotPeakProf2()} is different from the \code{plotPeakProf()}. \code{plotPeakProf2()} do not need to provide \code{window} parameter, which means \code{plotPeakProf2()} will call relevent functions to make \code{window} automatically. } \author{ G Yu, Ming Li } ================================================ FILE: man/plotPeakProf_MultiWindows.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/plotTagMatrix.R \name{plotPeakProf_MultiWindows} \alias{plotPeakProf_MultiWindows} \title{plotPeakProf_MultiWindows} \usage{ plotPeakProf_MultiWindows( peak, upstream, downstream, conf, by, type, windows_name = NULL, weightCol = NULL, TxDb = NULL, xlab = "Genomic Region (5'->3')", ylab = "Peak Count Frequency", facet = "row", free_y = TRUE, verbose = TRUE, nbin = NULL, ignore_strand = FALSE, ... ) } \arguments{ \item{peak}{peak file or GRanges object} \item{upstream}{upstream position} \item{downstream}{downstream position} \item{conf}{confidence interval} \item{by}{feature of interest} \item{type}{one of "start_site", "end_site", "body"} \item{windows_name}{the name for each window, which will also be showed in the picture as labels} \item{weightCol}{column name of weight} \item{TxDb}{TxDb object or self-made granges objects} \item{xlab}{xlab} \item{ylab}{ylab} \item{facet}{one of 'none', 'row' and 'column'} \item{free_y}{if TRUE, y will be scaled by AvgProf} \item{verbose}{print message or not} \item{nbin}{the amount of bines} \item{ignore_strand}{ignore the strand information or not} \item{...}{additional parameter} } \value{ ggplot object } \description{ plot the profile of peaks in two or more windows } \details{ This function comes from \url{https://github.com/YuLab-SMU/ChIPseeker/issues/189} ` \code{plotPeakProf_MultiWindows()} is almost the same as \code{plotPeakProf2()}, having the main difference of accepting two or more granges objects. Accepting more granges objects can help compare the same peaks in different windows. \code{TxDb} parameter can accept txdb object. But many regions can not be obtained by txdb object. In this case, Users can provide self-made granges served the same role as txdb object and pass to \code{TxDb} object. \code{by} the features of interest. (1) if users use \code{txdb}, \code{by} can be one of 'gene', 'transcript', 'exon', 'intron' , '3UTR' , '5UTR', 'UTR'. These features can be obtained by functions from txdb object. (2) if users use self-made granges object, \code{by} can be everything. Because this \code{by} will not pass to functions to get features, which is different from the case of using txdb object. This \code{by} is only used to made labels showed in picture. \code{type} means the property of the region. one of the "start site", "end site" and "body". \code{upstream} and \code{downstream} parameter have different usages: (1) if \code{type == 'body'}, \code{upstream} and \code{downstream} can use to extend the flank of body region. (2) if \code{type == 'start_site'/'end_site'}, \code{upstream} and \code{downstream} refer to the upstream and downstream of the start_site or the end_site. \code{weightCol} refers to column in peak file. This column acts as a weight value. Details see \url{https://github.com/YuLab-SMU/ChIPseeker/issues/15} \code{nbin} refers to the number of bins. \code{getTagMatrix()} provide a binning method to get the tag matrix. There are two ways input a list of window. (1) Users can input a list of self-made granges objects (2) Users can input a list of \code{by} and only one \code{type}. In this way, \code{plotPeakProf_MultiWindows()} can made a list of window from txdb object based on \code{by} and \code{type}. Warning: (1) All of these window should be the same type. It means users can only compare a list of "start site"/"end site"/"body region" with the same upstream and downstream. (2) So it will be only one \code{type} and several \code{by}. (3) Users can make window by txdb object or self-made granges object. Users can only choose one of 'gene', 'transcript', 'exon', 'intron' , '3UTR' , '5UTR' or 'UTR' in the way of using txdb object. User can input any \code{by} in the way of using self-made granges object. (4) Users can mingle the \code{by} designed for the two ways. \code{plotPeakProf_MultiWindows} can accpet the hybrid \code{by}. But the above rules should be followed. } ================================================ FILE: man/readPeakFile.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/readPeakFile.R \name{readPeakFile} \alias{readPeakFile} \title{readPeakFile} \usage{ readPeakFile(peakfile, as = "GRanges", ...) } \arguments{ \item{peakfile}{peak file} \item{as}{output format, one of GRanges or data.frame} \item{...}{additional parameter (pass to `utils::read.delim()`)} } \value{ peak information, in GRanges or data.frame object } \description{ read peak file and store in data.frame or GRanges object } \examples{ peakfile <- system.file("extdata", "sample_peaks.txt", package="ChIPseeker") peak.gr <- readPeakFile(peakfile, as="GRanges") peak.gr } \author{ G Yu } ================================================ FILE: man/reexports.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/utilities.R \docType{import} \name{reexports} \alias{reexports} \alias{GRangesList} \alias{rel} \title{Objects exported from other packages} \keyword{internal} \description{ These objects are imported from other packages. Follow the links below to see their documentation. \describe{ \item{GenomicRanges}{\code{\link[GenomicRanges:GRangesList-class]{GRangesList}}} \item{ggplot2}{\code{\link[ggplot2:element]{rel}}} }} ================================================ FILE: man/seq2gene.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/seq2gene.R \name{seq2gene} \alias{seq2gene} \title{seq2gene} \usage{ seq2gene(seq, tssRegion, flankDistance, TxDb, sameStrand = FALSE) } \arguments{ \item{seq}{genomic regions in GRanges object} \item{tssRegion}{TSS region} \item{flankDistance}{flanking search radius} \item{TxDb}{TranscriptDb object} \item{sameStrand}{logical whether find nearest/overlap gene in the same strand} } \value{ gene vector } \description{ annotate genomic regions to genes in many-to-many mapping } \details{ This funciton associates genomic regions with coding genes in a many-to-many mapping. It first maps genomic regions to host genes (either located in exon or intron), proximal genes (located in promoter regions) and flanking genes (located in upstream and downstream within user specify distance). } \examples{ \dontrun{ library(TxDb.Hsapiens.UCSC.hg19.knownGene) TxDb <- TxDb.Hsapiens.UCSC.hg19.knownGene file <- getSampleFiles()[[1]] # a bed file gr <- readPeakFile(file) genes <- seq2gene(gr, tssRegion=c(-1000, 1000), flankDistance = 3000, TxDb) } } \author{ Guangchuang Yu } ================================================ FILE: man/show-methods.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/csAnno.R \docType{methods} \name{show} \alias{show} \alias{show,csAnno,ANY-method} \title{show method} \usage{ show(object) } \arguments{ \item{object}{A \code{csAnno} instance} } \value{ message } \description{ show method for \code{csAnno} instance } \author{ Guangchuang Yu \url{https://guangchuangyu.github.io} } ================================================ FILE: man/shuffle.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/enrichOverlap.R \name{shuffle} \alias{shuffle} \title{shuffle} \usage{ shuffle(peak.gr, TxDb) } \arguments{ \item{peak.gr}{GRanges object} \item{TxDb}{TxDb} } \value{ GRanges object } \description{ shuffle the position of peak } \author{ G Yu } ================================================ FILE: man/tagHeatmap.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/plotTagMatrix.R \name{tagHeatmap} \alias{tagHeatmap} \title{tagHeatmap} \usage{ tagHeatmap( tagMatrix, xlab = "", ylab = "", title = NULL, palette = "RdBu", nrow = NULL, ncol = NULL ) } \arguments{ \item{tagMatrix}{tagMatrix or a list of tagMatrix} \item{xlab}{xlab} \item{ylab}{ylab} \item{title}{title} \item{palette}{palette to be filled in,details see \link[ggplot2]{scale_colour_brewer}} \item{nrow}{the nrow of plotting a list of peak} \item{ncol}{the ncol of plotting a list of peak} } \value{ figure } \description{ plot the heatmap of tagMatrix } \author{ G Yu } ================================================ FILE: man/upsetplot-methods.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/csAnno.R \docType{methods} \name{upsetplot} \alias{upsetplot} \title{upsetplot method} \usage{ upsetplot(x, ...) } \arguments{ \item{x}{A \code{csAnno} instance} \item{...}{additional parameter} } \value{ plot } \description{ upsetplot method generics } \author{ Guangchuang Yu \url{https://guangchuangyu.github.io} } ================================================ FILE: man/vennpie-methods.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/AllGenerics.R, R/csAnno.R \docType{methods} \name{vennpie} \alias{vennpie} \title{vennpie method generics} \usage{ vennpie(x, r = 0.2, cex = 1.2, ...) vennpie(x, r = 0.2, cex=1.2, ...) } \arguments{ \item{x}{A \code{csAnno} instance} \item{r}{initial radius} \item{cex}{value to adjust legend} \item{...}{additional parameter} } \value{ plot } \description{ vennpie method generics } \author{ Guangchuang Yu \url{https://guangchuangyu.github.io} } ================================================ FILE: man/vennplot.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/vennplot.R \name{vennplot} \alias{vennplot} \title{vennplot} \usage{ vennplot(Sets, by = "gplots", ...) } \arguments{ \item{Sets}{a list of object, can be vector or GRanges object} \item{by}{one of gplots, ggVennDiagram or Vennerable} \item{...}{extra parameters using ggVennDiagram. Details see \link[ggVennDiagram]{ggVennDiagram}} } \value{ venn plot that summarize the overlap of peaks from different experiments or gene annotation from different peak files. } \description{ plot the overlap of a list of object } \details{ There are two ways to plot, which users can specify through `by`. The first way is to use `gplots` packages, by setting `by = gplots`. This method is default method. The venn plot produced through this way has no color. The second way is to use `ggVennDiagram` packages, by setting `by = ggVennDiagram`. The venn plot produced through this way has colors which can be defined by users using ggplot2 grammar e.g.(scale_fill_distiller()). And users can specify any details, like digital number, text size and showing percentage or not, by inputting `...` extra parameters. } \examples{ ## example not run ## require(TxDb.Hsapiens.UCSC.hg19.knownGene) ## txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene ## peakfiles <- getSampleFiles() ## peakAnnoList <- lapply(peakfiles, annotatePeak) ## names(peakAnnoList) <- names(peakfiles) ## genes= lapply(peakAnnoList, function(i) as.data.frame(i)$geneId) ## vennplot(genes) } \author{ G Yu } ================================================ FILE: man/vennplot.peakfile.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/vennplot.R \name{vennplot.peakfile} \alias{vennplot.peakfile} \title{vennplot.peakfile} \usage{ vennplot.peakfile(files, labels = NULL) } \arguments{ \item{files}{peak files} \item{labels}{labels for peak files} } \value{ figure } \description{ vennplot for peak files } \author{ G Yu } ================================================ FILE: tests/testthat/test-bed.R ================================================ library(ChIPseeker) context("bed file") test_that("parse bed file", { files <- getSampleFiles() for (i in seq_along(files)) { expect_true(is(readPeakFile(files[[i]]), "GRanges")) } }) ================================================ FILE: tests/testthat/test-getTagMatrix.R ================================================ library(ChIPseeker) library(TxDb.Hsapiens.UCSC.hg19.knownGene) context("test getTagMatrix() and related functions") test_that("getBioRegion function", { # test three kinds of regions derived from getBioRegion() txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene gene_start <- getBioRegion(TxDb = txdb, upstream = 1000, downstream = 1000, by = 'gene', type = "start_site") expect_is(gene_start,"GRanges") gene_end <- getBioRegion(TxDb = txdb, upstream = 1000, downstream = 1000, by = 'gene', type = "end_site") expect_is(gene_end,"GRanges") gene_body <- getBioRegion(TxDb = txdb, upstream = 1000, downstream = 1000, by = 'gene', type = "body") expect_is(gene_body,"GRanges") }) test_that("getPromoters functions",{ # test two kinds of regions derived from getPromoters txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene gene <- getPromoters(TxDb=txdb, upstream=1000, downstream=1000, by = "gene") transcript <- getPromoters(TxDb=txdb, upstream=1000, downstream=1000, by = "transcript") expect_is(gene,"GRanges") expect_is(transcript,"GRanges") }) test_that("makeBioRegionFromGranges function",{ # we consider transcript region as enhancer region # and make self-made granges object # they can be the same in the form of granges object txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene enhancer <- transcripts(txdb)[1:5000,] ## we test three kinds of region, start_site, end_site and body enhancer_body <- makeBioRegionFromGranges(gr = enhancer, by = "enhancer", type = "body") enhancer_start <- makeBioRegionFromGranges(gr = enhancer, by = "enhancer", type = "start_site", upstream = 1000, downstream = 1000) enhancer_end <- makeBioRegionFromGranges(gr = enhancer, by = "enhancer", type = "end_site", upstream = 1000, downstream = 1000) expect_is(enhancer_body,"GRanges") expect_is(enhancer_start,"GRanges") expect_is(enhancer_end,"GRanges") ## test the label expect_equal(attr(enhancer_body,'label'),c("enhancer_SS","enhancer_TS")) expect_equal(attr(enhancer_start,'label'),"enhancer_SS") expect_equal(attr(enhancer_end,'label'),"enhancer_TS") }) test_that("getTagMatrix function for single peak file",{ peak <- getSampleFiles()[[4]] txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene # make the window by getBioRegion() gene_start <- getBioRegion(TxDb = txdb, upstream = 1000, downstream = 1000, by = 'gene', type = "start_site") # make the window by makeBioRegionFromGranges() enhancer <- transcripts(txdb)[1:5000,] enhancer_body <- makeBioRegionFromGranges(gr = enhancer, by = "enhancer", type = "body") # test input window parameter mt1 <- getTagMatrix(peak = peak, windows = gene_start, weightCol = "V5") expect_is(mt1, "matrix") # without extending flank mt2_1 <- getTagMatrix(peak = peak, windows = enhancer_body, weightCol = "V5", nbin = 800) expect_is(mt2_1, "matrix") # extend flank by rel object mt2_2 <- getTagMatrix(peak = peak, windows = enhancer_body, weightCol = "V5", upstream = rel(0.2), downstream = rel(0.2), nbin = 800) expect_is(mt2_2, "matrix") # extend flank by actual number mt2_3 <- getTagMatrix(peak = peak, windows = enhancer_body, weightCol = "V5", upstream = 1000, downstream = 1000, nbin = 800) expect_is(mt2_3, "matrix") # test input without window parameter # make window through txdb object mt3 <- getTagMatrix(peak = peak, weightCol = "V5", TxDb = txdb, by = "gene", type = "start_site", upstream = 3000, downstream = 3000) expect_is(mt3, "matrix") # make window through self-made grange object mt4 <- getTagMatrix(peak = peak, weightCol = "V5", TxDb = enhancer, by = "gene", type = "start_site", upstream = 1000, downstream = 1000) expect_is(mt4, "matrix") # without extending flank mt5_1 <- getTagMatrix(peak = peak, weightCol = "V5", TxDb = txdb, by = "gene", type = "body", nbin = 800) expect_is(mt5_1, "matrix") # extend flank by rel object mt5_2 <- getTagMatrix(peak = peak, TxDb = enhancer, weightCol = "V5", by = "enhancer", type = "body", upstream = rel(0.2), downstream = rel(0.2), nbin = 800) expect_is(mt5_2, "matrix") # extend flank by actual number mt5_3 <- getTagMatrix(peak = peak, TxDb = txdb, weightCol = "V5", by = "gene", type = "body", upstream = 1000, downstream = 1000, nbin = 800) expect_is(mt5_3, "matrix") }) ================================================ FILE: tests/testthat/test-txdb.R ================================================ library(TxDb.Hsapiens.UCSC.hg19.knownGene) library(TxDb.Hsapiens.UCSC.hg38.knownGene) library(ChIPseeker) library(yulab.utils) context("TXDB") test_that("Update txdb", { hg19_txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene ChIPseeker:::.ChIPseekerEnv(hg19_txdb) expect_equal(ChIPseeker:::get_env_genome(), "hg19") hg38_txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene ChIPseeker:::.ChIPseekerEnv(hg38_txdb) expect_equal(ChIPseeker:::get_env_genome(), "hg38") }) test_that("txdb", { txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene ChIPseeker:::.ChIPseekerEnv(txdb) expect_equal(ChIPseeker:::IDType(txdb), "Entrez Gene ID") if (packageVersion("TxDb.Hsapiens.UCSC.hg19.knownGene") > "3.22") { expect_equal( ChIPseeker:::TXID2EG("70455"), "ENST00000487630.1_3/ENST00000487630.1_3" ) expect_equal( ChIPseeker:::TXID2EG("70455", geneIdOnly = TRUE), "ENST00000487630.1_3" ) } else { expect_equal(ChIPseeker:::TXID2EG("70455"), "uc002qsd.4/1") expect_equal(ChIPseeker:::TXID2EG("70455", geneIdOnly = TRUE), "1") } }) ================================================ FILE: tests/testthat.R ================================================ library(testthat) library(ChIPseeker) test_check("ChIPseeker") ================================================ FILE: vignettes/ChIPseeker.Rmd ================================================ --- title: "ChIPseeker: an R package for ChIP peak Annotation, Comparison and Visualization" author: "Guangchuang Yu\\ School of Basic Medical Sciences, Southern Medical University" date: "`r Sys.Date()`" bibliography: ChIPseeker.bib biblio-style: apalike output: prettydoc::html_pretty: toc: true theme: cayman highlight: github pdf_document: toc: true vignette: > %\VignetteIndexEntry{ChIPseeker: an R package for ChIP peak Annotation, Comparison and Visualization} %\VignetteEngine{knitr::rmarkdown} %\usepackage[utf8]{inputenc} %\VignetteEncoding{UTF-8} --- ```{r style, echo=FALSE, results='asis', message=FALSE} knitr::opts_chunk$set(tidy = FALSE, warning = FALSE, message = FALSE) library(yulab.utils) Biocannopkg <- yulab.utils::Biocpkg ``` ```{r echo=FALSE, results='hide', message=FALSE} library(GenomicFeatures) library(GenomicRanges) library(TxDb.Hsapiens.UCSC.hg19.knownGene) library(org.Hs.eg.db) library(ggplot2) library(clusterProfiler) library(ReactomePA) library(ChIPseeker) ``` # Abstract ChIPseeker is an R package for annotating ChIP-seq data analysis. It supports annotating ChIP peaks and provides functions to visualize ChIP peaks coverage over chromosomes and profiles of peaks binding to TSS regions. Comparison of ChIP peak profiles and annotation are also supported. Moreover, it supports evaluating significant overlap among ChIP-seq datasets. Currently, ChIPseeker contains 17,000 bed file information from GEO database. These datasets can be downloaded and compare with user's own data to explore significant overlap datasets for inferring co-regulation or transcription factor complex for further investigation. # Citation If you use `r Biocpkg("ChIPseeker")`[@yu_chipseeker_2015] in published research, please cite: + Q Wang#, M Li#, T Wu, L Zhan, L Li, M Chen, W Xie, Z Xie, E Hu, S Xu, __G Yu__\*. [Exploring epigenomic datasets by ChIPseeker](https://onlinelibrary.wiley.com/share/author/GYJGUBYCTRMYJFN2JFZZ?target=10.1002/cpz1.585). __*Current Protocols*__, 2022, 2(10): e585. + __G Yu__\*, LG Wang, QY He\*. [ChIPseeker: an R/Bioconductor package for ChIP peak annotation, comparision and visualization](http://bioinformatics.oxfordjournals.org/cgi/content/abstract/btv145). __*Bioinformatics*__. 2015, 31(14):2382-2383. # Introduction Chromatin immunoprecipitation followed by high-throughput sequencing (ChIP-seq) has become standard technologies for genome wide identification of DNA-binding protein target sites. After read mappings and peak callings, the peak should be annotated to answer the biological questions. Annotation also create the possibility of integrating expression profile data to predict gene expression regulation. `r Biocpkg("ChIPseeker")`[@yu_chipseeker_2015] was developed for annotating nearest genes and genomic features to peaks. ChIP peak data set comparison is also very important. We can use it as an index to estimate how well biological replications are. Even more important is applying to infer cooperative regulation. If two ChIP seq data, obtained by two different binding proteins, overlap significantly, these two proteins may form a complex or have interaction in regulation chromosome remodelling or gene expression. `r Biocpkg("ChIPseeker")`[@yu_chipseeker_2015] support statistical testing of significant overlap among ChIP seq data sets, and incorporate open access database GEO for users to compare their own dataset to those deposited in database. Protein interaction hypothesis can be generated by mining data deposited in database. Converting genome coordinations from one genome version to another is also supported, making this comparison available for different genome version and different species. Several visualization functions are implemented to visualize the coverage of the ChIP seq data, peak annotation, average profile and heatmap of peaks binding to TSS region. Functional enrichment analysis of the peaks can be performed by my Bioconductor packages `r Biocpkg("DOSE")`[@yu_dose_2015], `r Biocpkg("ReactomePA")`[@yu_reactomepa_2016], `r Biocpkg("clusterProfiler")`[@yu_clusterprofiler_2012]. ```{r} ## loading packages library(ChIPseeker) library(TxDb.Hsapiens.UCSC.hg19.knownGene) txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene library(clusterProfiler) ``` # ChIP profiling The datasets _CBX6_ and _CBX7_ in this vignettes were downloaded from _GEO (GSE40740)_[@pemberton_genome-wide_2014] while _ARmo\_0M_, _ARmo\_1nM_ and _ARmo\_100nM_ were downloaded from _GEO (GSE48308)_[@urbanucci_overexpression_2012] . `r Biocpkg("ChIPseeker")` provides `readPeakFile` to load the peak and store in `GRanges` object. ```{r} files <- getSampleFiles() print(files) peak <- readPeakFile(files[[4]]) peak ``` ## ChIP peaks coverage plot After peak calling, we would like to know the peak locations over the whole genome, `covplot` function calculates the coverage of peak regions over chromosomes and generate a figure to visualize. [GRangesList](https://guangchuangyu.github.io/2016/02/covplot-supports-grangeslist) is also supported and can be used to compare coverage of multiple bed files. ```{r fig.height=8, fig.width=10} covplot(peak, weightCol="V5") ``` ```{r fig.height=4, fig.width=10} covplot(peak, weightCol="V5", chrs=c("chr17", "chr18"), xlim=c(4.5e7, 5e7)) ``` When `peak` is a `GRangsList` object, user can set the colors directly or by passing a palette to `fill_color`. ```{r fig.height=8, fig.width=10} peaks = lapply(files[4:5], readPeakFile) covplot(peaks, weightCol = "V5", fill_color = c("red","blue")) + theme(legend.position = "inside", legend.position.inside = c(0.8,0.2)) ``` ## Profile of ChIP peaks binding to TSS regions First of all, for calculating the profile of ChIP peaks binding to TSS regions, we should prepare the TSS regions, which are defined as the flanking sequence of the TSS sites. Then align the peaks that are mapping to these regions, and generate the tagMatrix. ```{r} ## promoter <- getPromoters(TxDb=txdb, upstream=3000, downstream=3000) ## tagMatrix <- getTagMatrix(peak, windows=promoter) ## ## to speed up the compilation of this vignettes, we use a precalculated tagMatrix data("tagMatrixList") tagMatrix <- tagMatrixList[[4]] ``` In the above code, you should notice that tagMatrix is not restricted to TSS regions. The regions can be other types that defined by the user. `r Biocpkg("ChIPseeker")` expanded the scope of region. Users can input the `type` and `by` parameters to get the regions they want. ### Heatmap of ChIP binding to TSS regions ```{r fig.cap="Heatmap of ChIP peaks binding to TSS regions", fig.align="center", fig.height=9, fig.width=6} tagHeatmap(tagMatrix) ``` `r Biocpkg("ChIPseeker")` provide a one step function to generate this figure from bed file. The following function will generate the same figure as above. ```{r eval=FALSE} peakHeatmap(files[[4]], TxDb=txdb, upstream=3000, downstream=3000) ``` Users can use `nbin` parameter to speed up. ```{r eval=FALSE} peakHeatmap(files[[4]],TxDb = txdb,nbin = 800,upstream=3000, downstream=3000) ``` Users can also use ggplot method to change the details of the figures. ```{r eval=FALSE} peakHeatmap(files[[4]],TxDb = txdb,nbin = 800,upstream=3000, downstream=3000) + scale_fill_distiller(palette = "RdYlGn") ``` Users can also profile genebody regions with `peakHeatmap()`. ```{r fig.cap="Heatmap of genebody regions", fig.align="center", fig.height=9, fig.width=6,results='hide'} peakHeatmap(peak = files[[4]], TxDb = txdb, upstream = rel(0.2), downstream = rel(0.2), by = "gene", type = "body", nbin = 800) ``` Sometimes there will be a need to explore the comparison of the peak heatmap over two regions, for example, the following picture is the peak over two gene sets. One possible scenery of using this method is to compare the peak heatmap over up-regulating genes and down-regulating genes. Here `txdb1` and `txdb2` is the simulated gene sets obtain from `TxDb.Hsapiens.UCSC.hg19.knownGene`. Using `peakHeatmap_multiple_Sets()`, accepting `list` object containing different regions information. The length of each part is correlated to the amount of regions. ```{r fig.cap="Heatmap of over two regions", fig.align="center", fig.height=9, fig.width=6,results='hide'} txdb1 <- transcripts(TxDb.Hsapiens.UCSC.hg19.knownGene) txdb2 <- unlist(fiveUTRsByTranscript(TxDb.Hsapiens.UCSC.hg19.knownGene))[1:10000,] region_list <- list(geneX = txdb1, geneY = txdb2) peakHeatmap_multiple_Sets(peak = files[[4]], upstream = 1000,downstream = 1000, by = c("geneX","geneY"), type = "start_site", TxDb = region_list,nbin = 800) ``` We also meet the need of ploting heatmap and peak profiling together. ```{r fig.cap="Combination of heatmap and peak profiling", fig.align="center", fig.height=9, fig.width=6,results='hide'} peak_Profile_Heatmap(peak = files[[4]], upstream = 1000, downstream = 1000, by = "gene", type = "start_site", TxDb = txdb, nbin = 800) ``` Exploring several regions with heatmap and peak profiling is also supported. ```{r fig.cap="Combination of heatmap and peak profiling over several regions", fig.align="center", fig.height=12, fig.width=6,results='hide'} txdb1 <- transcripts(TxDb.Hsapiens.UCSC.hg19.knownGene) txdb2 <- unlist(fiveUTRsByTranscript(TxDb.Hsapiens.UCSC.hg19.knownGene))[1:10000,] region_list <- list(geneX = txdb1, geneY = txdb2) peak_Profile_Heatmap(peak = files[[4]], upstream = 1000, downstream = 1000, by = c("geneX","geneY"), type = "start_site", TxDb = region_list,nbin = 800) ``` ### Average Profile of ChIP peaks binding to TSS region ```{r eval=TRUE, fig.cap="Average Profile of ChIP peaks binding to TSS region", fig.align="center", fig.height=4, fig.width=7} plotAvgProf(tagMatrix, xlim=c(-3000, 3000), xlab="Genomic Region (5'->3')", ylab = "Read Count Frequency") ``` The function `plotAvgProf2` provide a one step from bed file to average profile plot. The following command will generate the same figure as shown above. ```{r eval=FALSE} plotAvgProf2(files[[4]], TxDb=txdb, upstream=3000, downstream=3000, xlab="Genomic Region (5'->3')", ylab = "Read Count Frequency") ``` Confidence interval estimated by bootstrap method is also supported for characterizing ChIP binding profiles. ```{r fig.cap="Average Profile of ChIP peaks binding to TSS region", fig.align="center", fig.height=4, fig.width=7, eval=F} plotAvgProf(tagMatrix, xlim=c(-3000, 3000), conf = 0.95, resample = 1000) ``` ![](figures/plotAvgProf_boot.png) ## Profile of ChIP peaks binding to different regions Referring to the [issue #16](https://github.com/GuangchuangYu/ChIPseeker/issues/16) , we developed and improved several functions support start site region, end site region and body region of Gene/Transcript/Exon/Intron/3UTR/5UTR. `getBioRegion` can prepare the different regions for ChIP peaks to bind. `getTagMatrix` can accept `type`, `by`, `upstream` and `downstream` parameters to get tagmatrix according to different needs. `plotPeakProf` and `plotPeakProf2` supports the plotting of profiles of peaks binding to different regions.Users can also create heatmap or average profile of ChIP peaks binding to these regions. In order to plot body regions, a new methond `binning`,was introduced to `getTagMatrix`. The idea of `binning` was derived from [deeptools](https://deeptools.readthedocs.io/en/develop/content/tools/computeMatrix.html)[@ramirez2016deeptools2]. `binning` scaled the regions having different lengths to the equal length by deviding the regions into the same amounts of boxes. Because the amount of boxes is equal, the regions can be thought of scaling to equal length.`binning` method can speed up the `getTagMatrix` by changing the precision from bp to box(several bps). There are three ways to plot these regions. First, users can use `getBioRegion` to prepare the regions. Then align the peaks that are mapping to these regions, and generate the tagMatrix by `getTagMatrix`. At Last, plot the figures by `plotPeakProf`. Second, users can input `type` and `by` parameters to `getTagMatrix` to get the tagMatrix and plot the figures. Third, users can use `plotPeakProf2` to do everything in one step. ### Binning method for profile of ChIP peaks binding to TSS regions Here uses the method of inputting `type` and `by` parameters. `type = "start_site"` means the start site region. `by = "gene"` means that it is the start site region of gene(TSS regions). If users want to use binning method, the `nbin` method must be set. ```{r eval=F} ## The results of binning method and normal method are nearly the same. tagMatrix_binning <- getTagMatrix(peak = peak, TxDb = txdb, upstream = 3000, downstream = 3000, type = "start_site", by = "gene", weightCol = "V5", nbin = 800) ``` ### Profile of ChIP peaks binding to body regions We improved and developed several functions to plot body region of Gene/Transcript/Exon/Intron/3UTR/5UTR. If users want to get more information from the body region, we added `upstream` and `downstream` parameters to functions in order to get flank extension of body regions. `upstream` and `downstream` can be NULL(default), rel object and actual numbers. NULL(default) reflects body regions with no flank extension. Rel object reflects the percentage of total length of body regions. Actual numbers reflects the actual length of flank extension. ```{r eval=F} ## Here uses `plotPeakProf2` to do all things in one step. ## Gene body regions having lengths smaller than nbin will be filtered ## A message will be given to warning users about that. ## >> 9 peaks(0.872093%), having lengths smaller than 800bp, are filtered... ## the ignore_strand is FALSE in default. We put here to emphasize that. ## We will not show it again in the below example plotPeakProf2(peak = peak, upstream = rel(0.2), downstream = rel(0.2), conf = 0.95, by = "gene", type = "body", nbin = 800, TxDb = txdb, weightCol = "V5",ignore_strand = F) ``` ![](figures/plotPeakProf2_body_extend.png) Users can also get the profile ChIP peaks binding to gene body regions with no flank extension or flank extension decided by actual length. ```{r eval=F} ## The first method using getBioRegion(), getTagMatrix() and plotPeakProf() to plot in three steps. genebody <- getBioRegion(TxDb = txdb, by = "gene", type = "body") matrix_no_flankextension <- getTagMatrix(peak,windows = genebody, nbin = 800) plotPeakProf(matrix_no_flankextension,conf = 0.95) ## The second method of using getTagMatrix() and plotPeakProf() to plot in two steps matrix_actual_extension <- getTagMatrix(peak,windows = genebody, nbin = 800, upstream = 1000,downstream = 1000) plotPeakProf(matrix_actual_extension,conf = 0.95) ``` Users can also get the body region of 5UTR/3UTR. ```{r eval=F} five_UTR_body <- getTagMatrix(peak = peak, TxDb = txdb, upstream = rel(0.2), downstream = rel(0.2), type = "body", by = "5UTR", weightCol = "V5", nbin = 50) plotPeakProf(tagMatrix = five_UTR_body, conf = 0.95) ``` ### Profile of ChIP peaks binding to TTS regions ```{r eval=F} TTS_matrix <- getTagMatrix(peak = peak, TxDb = txdb, upstream = 3000, downstream = 3000, type = "end_site", by = "gene", weightCol = "V5") plotPeakProf(tagMatrix = TTS_matrix, conf = 0.95) ``` # Peak Annotation ```{r} peakAnno <- annotatePeak(files[[4]], tssRegion=c(-3000, 3000), TxDb=txdb, annoDb="org.Hs.eg.db") ``` Note that it would also be possible to use Ensembl-based `EnsDb` annotation databases created by the `r Biocpkg("ensembldb")` package for the peak annotations by providing it with the `TxDb` parameter. Since UCSC-style chromosome names are used we have to change the style of the chromosome names from *Ensembl* to *UCSC* in the example below. ```{r, eval = FALSE} library(EnsDb.Hsapiens.v75) edb <- EnsDb.Hsapiens.v75 seqlevelsStyle(edb) <- "UCSC" peakAnno.edb <- annotatePeak(files[[4]], tssRegion=c(-3000, 3000), TxDb=edb, annoDb="org.Hs.eg.db") ``` Peak Annotation is performed by `annotatePeak`. User can define TSS (transcription start site) region, by default TSS is defined from -3kb to +3kb. The output of `annotatePeak` is `csAnno` instance. `r Biocpkg("ChIPseeker")` provides `as.GRanges` to convert `csAnno` to `GRanges` instance, and `as.data.frame` to convert `csAnno` to `data.frame` which can be exported to file by `write.table`. `TxDb` object contained transcript-related features of a particular genome. Bioconductor provides several package that containing `TxDb` object of model organisms with multiple commonly used genome version, for instance `r Biocannopkg("TxDb.Hsapiens.UCSC.hg38.knownGene")`, `r Biocannopkg("TxDb.Hsapiens.UCSC.hg19.knownGene")` for human genome hg38 and hg19, `r Biocannopkg("TxDb.Mmusculus.UCSC.mm10.knownGene")` and `r Biocannopkg("TxDb.Mmusculus.UCSC.mm9.knownGene")` for mouse genome mm10 and mm9, etc. User can also prepare their own `TxDb` object by retrieving information from UCSC Genome Bioinformatics and BioMart data resources by R function `makeTxDbFromBiomart` and `makeTxDbFromUCSC`. `TxDb` object should be passed for peak annotation. All the peak information contained in peakfile will be retained in the output of `annotatePeak`. The position and strand information of nearest genes are reported. The distance from peak to the TSS of its nearest gene is also reported. The genomic region of the peak is reported in annotation column. Since some annotation may overlap, `r Biocpkg("ChIPseeker")` adopted the following priority in genomic annotation. * Promoter * 5' UTR * 3' UTR * Exon * Intron * Downstream * Intergenic _Downstream_ is defined as the downstream of gene end. `r Biocpkg("ChIPseeker")` also provides parameter _genomicAnnotationPriority_ for user to prioritize this hierachy. `annotatePeak` report detail information when the annotation is Exon or Intron, for instance "Exon (uc002sbe.3/9736, exon 69 of 80)", means that the peak is overlap with an Exon of transcript uc002sbe.3, and the corresponding Entrez gene ID is 9736 (Transcripts that belong to the same gene ID may differ in splice events), and this overlaped exon is the 69th exon of the 80 exons that this transcript uc002sbe.3 prossess. Parameter annoDb is optional, if provided, extra columns including SYMBOL, GENENAME, ENSEMBL/ENTREZID will be added. The geneId column in annotation output will be consistent with the geneID in TxDb. If it is ENTREZID, ENSEMBL will be added if annoDb is provided, while if it is ENSEMBL ID, ENTREZID will be added. ## Visualize Genomic Annotation To annotate the location of a given peak in terms of genomic features, `annotatePeak` assigns peaks to genomic annotation in "annotation" column of the output, which includes whether a peak is in the TSS, Exon, 5' UTR, 3' UTR, Intronic or Intergenic. Many researchers are very interesting in these annotations. TSS region can be defined by user and `annotatePeak` output in details of which exon/intron of which genes as illustrated in previous section. Pie and Bar plot are supported to visualize the genomic annotation. ```{r fig.cap="Genomic Annotation by pieplot", fig.align="center", fig.height=6, fig.width=8} plotAnnoPie(peakAnno) ``` ```{r fig.cap="Genomic Annotation by barplot", fig.align="center", fig.height=4, fig.width=10} plotAnnoBar(peakAnno) ``` Since some annotation overlap, user may interested to view the full annotation with their overlap, which can be partially resolved by `vennpie` function. ```{r fig.cap="Genomic Annotation by vennpie", fig.align="center", fig.height=8, fig.width=11} vennpie(peakAnno) ``` We extend `r CRANpkg("UpSetR")` to view full annotation overlap. User can user `upsetplot` function. ```{r eval=F, fig.cap="Genomic Annotation by upsetplot", fig.align="center", fig.height=8, fig.width=12} upsetplot(peakAnno) ``` ![](figures/upset.png) We can combine `vennpie` with `upsetplot` by setting *vennpie = TRUE*. ```{r eval=F, fig.cap="Genomic Annotation by upsetplot", fig.align="center", fig.height=8, fig.width=12} upsetplot(peakAnno, vennpie=TRUE) ``` ![](figures/upset_vennpie.png) ## Visualize distribution of TF-binding loci relative to TSS The distance from the peak (binding site) to the TSS of the nearest gene is calculated by `annotatePeak` and reported in the output. We provide `plotDistToTSS` to calculate the percentage of binding sites upstream and downstream from the TSS of the nearest genes, and visualize the distribution. ```{r fig.cap="Distribution of Binding Sites", fig.align="center", fig.height=2, fig.width=6} plotDistToTSS(peakAnno, title="Distribution of transcription factor-binding loci\nrelative to TSS") ``` # Functional enrichment analysis Once we have obtained the annotated nearest genes, we can perform functional enrichment analysis to identify predominant biological themes among these genes by incorporating biological knowledge provided by biological ontologies. For instance, Gene Ontology (GO)[@ashburner_gene_2000] annotates genes to biological processes, molecular functions, and cellular components in a directed acyclic graph structure, Kyoto Encyclopedia of Genes and Genomes (KEGG)[@kanehisa_kegg_2004] annotates genes to pathways, Disease Ontology (DO)[@schriml_disease_2011] annotates genes with human disease association, and Reactome[@croft_reactome_2013] annotates gene to pathways and reactions. `r Biocpkg("ChIPseeker")` also provides a function, __*seq2gene*__, for linking genomc regions to genes in a many-to-many mapping. It consider host gene (exon/intron), promoter region and flanking gene from intergenic region that may under control via cis-regulation. This function is designed to link both coding and non-coding genomic regions to coding genes and facilitate functional analysis. Enrichment analysis is a widely used approach to identify biological themes. I have developed several Bioconductor packages for investigating whether the number of selected genes associated with a particular biological term is larger than expected, including `r Biocpkg("DOSE")`[@yu_dose_2015] for Disease Ontology, `r Biocpkg("ReactomePA")` for reactome pathway, `r Biocpkg("clusterProfiler")`[@yu_clusterprofiler_2012] for Gene Ontology and KEGG enrichment analysis. ```{r fig.width=8, fig.height=5} library(ReactomePA) pathway1 <- enrichPathway(as.data.frame(peakAnno)$geneId) head(pathway1, 2) gene <- seq2gene(peak, tssRegion = c(-1000, 1000), flankDistance = 3000, TxDb=txdb) pathway2 <- enrichPathway(gene) head(pathway2, 2) dotplot(pathway2) ``` More information can be found in the vignettes of Bioconductor packages `r Biocpkg("DOSE")`[@yu_dose_2015], `r Biocpkg("ReactomePA")`, `r Biocpkg("clusterProfiler")`[@yu_clusterprofiler_2012], which also provide several methods to visualize enrichment results. The `r Biocpkg("clusterProfiler")`[@yu_clusterprofiler_2012] is designed for comparing and visualizing functional profiles among gene clusters, and can directly applied to compare biological themes at GO, DO, KEGG, Reactome perspective. # ChIP peak data set comparison ## Profile of several ChIP peak data binding to TSS region Function `plotAvgProf`, `tagHeatmap` and `plotPeakProf` can accept a list of `tagMatrix` and visualize profile or heatmap among several ChIP experiments, while `plotAvgProf2` , `peakHeatmap` and `plotPeakProf2` can accept a list of bed files and perform the same task in one step. ### Average profiles ```{r eval=TRUE, fig.cap="Average Profiles of ChIP peaks among different experiments", fig.align="center", fig.height=4, fig.width=6} ## promoter <- getPromoters(TxDb=txdb, upstream=3000, downstream=3000) ## tagMatrixList <- lapply(files, getTagMatrix, windows=promoter) ## ## to speed up the compilation of this vigenette, we load a precaculated tagMatrixList data("tagMatrixList") plotAvgProf(tagMatrixList, xlim=c(-3000, 3000)) ``` ```{r eval=FALSE, fig.cap="Average Profiles of ChIP peaks among different experiments", fig.align="center", fig.height=7, fig.width=6} plotAvgProf(tagMatrixList, xlim=c(-3000, 3000), conf=0.95,resample=500, facet="row") ``` ![](figures/plotAvgProf_boot_list.png) ```{r eval=F} ## normal method plotPeakProf2(files, upstream = 3000, downstream = 3000, conf = 0.95, by = "gene", type = "start_site", TxDb = txdb, facet = "row") ## binning method plotPeakProf2(files, upstream = 3000, downstream = 3000, conf = 0.95, by = "gene", type = "start_site", TxDb = txdb, facet = "row", nbin = 800) ``` ### Peak heatmaps ```{r eval=TRUE, fig.cap="Heatmap of ChIP peaks among different experiments", fig.align="center", fig.height=8, fig.width=16} tagHeatmap(tagMatrixList) ``` ## Profile of several ChIP peak data binding to body region Functions `plotPeakProf` and `plotPeakProf2` also support to plot profile of several ChIP peak data binding to body region. ```{r eval=F} plotPeakProf2(files, upstream = rel(0.2), downstream = rel(0.2), conf = 0.95, by = "gene", type = "body", TxDb = txdb, facet = "row", nbin = 800) ``` ![](figures/plotPeakProf_body_boot_list_.png) ## ChIP peak annotation comparision The `plotAnnoBar` and `plotDistToTSS` can also accept input of a named list of annotated peaks (output of `annotatePeak`). ```{r} peakAnnoList <- lapply(files, annotatePeak, TxDb=txdb, tssRegion=c(-3000, 3000), verbose=FALSE) ``` We can use `plotAnnoBar` to comparing their genomic annotation. ```{r fig.cap="Genomic Annotation among different ChIPseq data", fig.align="center", fig.height=4, fig.width=6} plotAnnoBar(peakAnnoList) ``` R function `plotDistToTSS` can use to comparing distance to TSS profiles among ChIPseq data. ```{r fig.cap="Distribution of Binding Sites among different ChIPseq data", fig.align="center", fig.height=5, fig.width=8} plotDistToTSS(peakAnnoList) ``` ## Functional profiles comparison As shown in section 4, the annotated genes can analyzed by `r Biocpkg("clusterProfiler")`[@yu_clusterprofiler_2012], `r Biocpkg("DOSE")`[@yu_dose_2015], `r Biocpkg("meshes")` and `r Biocpkg("ReactomePA")` for Gene Ontology, KEGG, Disease Ontology, MeSH and Reactome Pathway enrichment analysis. The `r Biocpkg("clusterProfiler")`[@yu_clusterprofiler_2012] package provides `compareCluster` function for comparing biological themes among gene clusters, and can be easily adopted to compare different ChIP peak experiments. ```{r fig.width=8.5, fig.height=8.5} genes = lapply(peakAnnoList, function(i) as.data.frame(i)$geneId) names(genes) = sub("_", "\n", names(genes)) compKEGG <- compareCluster(geneCluster = genes, fun = "enrichKEGG", pvalueCutoff = 0.05, pAdjustMethod = "BH") dotplot(compKEGG, showCategory = 15, title = "KEGG Pathway Enrichment Analysis") ``` ## Overlap of peaks and annotated genes User may want to compare the overlap peaks of replicate experiments or from different experiments. `r Biocpkg("ChIPseeker")` provides `peak2GRanges` that can read peak file and stored in GRanges object. Several files can be read simultaneously using lapply, and then passed to `vennplot` to calculate their overlap and draw venn plot. `vennplot` accept a list of object, can be a list of GRanges or a list of vector. Here, I will demonstrate using `vennplot` to visualize the overlap of the nearest genes stored in peakAnnoList. ```{r fig.cap="Overlap of annotated genes", fig.align="center", fig.height=7, fig.width=7} genes= lapply(peakAnnoList, function(i) as.data.frame(i)$geneId) vennplot(genes) ``` # Statistical testing of ChIP seq overlap Overlap is very important, if two ChIP experiment by two different proteins overlap in a large fraction of their peaks, they may cooperative in regulation. Calculating the overlap is only touch the surface. `r Biocpkg("ChIPseeker")` implemented statistical methods to measure the significance of the overlap. ## Shuffle genome coordination ```{r} p <- GRanges(seqnames=c("chr1", "chr3"), ranges=IRanges(start=c(1, 100), end=c(50, 130))) shuffle(p, TxDb=txdb) ``` We implement the `shuffle` function to randomly permute the genomic locations of ChIP peaks defined in a genome which stored in `TxDb` object. ## Peak overlap enrichment analysis With the ease of this `shuffle` method, we can generate thousands of random ChIP data and calculate the background null distribution of the overlap among ChIP data sets. ```{r} enrichPeakOverlap(queryPeak = files[[5]], targetPeak = unlist(files[1:4]), TxDb = txdb, pAdjustMethod = "BH", nShuffle = 50, chainFile = NULL, verbose = FALSE) ``` Parameter _queryPeak_ is the query ChIP data, while _targetPeak_ is bed file name or a vector of bed file names from comparison; _nShuffle_ is the number to shuffle the peaks in _targetPeak_. To speed up the compilation of this vignettes, we only set _nShuffle_ to 50 as an example for only demonstration. User should set the number to 1000 or above for more robust result. Parameter _chainFile_ are chain file name for mapping the _targetPeak_ to the genome version consistent with _queryPeak_ when their genome version are different. This creat the possibility of comparison among different genome version and cross species. In the output, _qSample_ is the name of _queryPeak_ and _qLen_ is the the number of peaks in _queryPeak_. _N\_OL_ is the number of overlap between _queryPeak_ and _targetPeak_. # Data Mining with ChIP seq data deposited in GEO There are many ChIP seq data sets that have been published and deposited in GEO database. We can compare our own dataset to those deposited in GEO to search for significant overlap data. Significant overlap of ChIP seq data by different binding proteins may be used to infer cooperative regulation and thus can be used to generate hypotheses. We collect about **17,000** bed files deposited in GEO, user can use `getGEOspecies` to get a summary based on speices. ## GEO data collection ```{r} getGEOspecies() ``` The summary can also based on genome version as illustrated below: ```{r} getGEOgenomeVersion() ``` User can access the detail information by `getGEOInfo`, for each genome version. ```{r} hg19 <- getGEOInfo(genome="hg19", simplify=TRUE) head(hg19) ``` If _simplify_ is set to _FALSE_, extra information including _source\_name_, _extract\_protocol_, _description_, _data\_processing_ and _submission\_date_ will be incorporated. ## Download GEO ChIP data sets `r Biocpkg("ChIPseeker")` provide function `downloadGEObedFiles` to download all the bed files of a particular genome. ```{r eval=FALSE} downloadGEObedFiles(genome="hg19", destDir="hg19") ``` Or a vector of GSM accession number by `downloadGSMbedFiles`. ```{r eval=FALSE} gsm <- hg19$gsm[sample(nrow(hg19), 10)] downloadGSMbedFiles(gsm, destDir="hg19") ``` ## Overlap significant testing After download the bed files from GEO, we can pass them to `enrichPeakOverlap` for testing the significant of overlap. Parameter _targetPeak_ can be the folder, _e.g._ hg19, that containing bed files. `enrichPeakOverlap` will parse the folder and compare all the bed files. It is possible to test the overlap with bed files that are mapping to different genome or different genome versions, `enrichPeakOverlap` provide a parameter _chainFile_ that can pass a chain file and liftOver the _targetPeak_ to the genome version consistent with _queryPeak_. Signifcant overlap can be use to generate hypothesis of cooperative regulation.By mining the data deposited in GEO, we can identify some putative complex or interacted regulators in gene expression regulation or chromsome remodelling for further validation. # Need helps? If you have questions/issues, please visit [ChIPseeker homepage](https://guangchuangyu.github.io/software/ChIPseeker/) first. Your problems are mostly documented. If you think you found a bug, please follow [the guide](https://guangchuangyu.github.io/2016/07/how-to-bug-author/) and provide a reproducible example to be posted on [github issue tracker](https://github.com/GuangchuangYu/ChIPseeker/issues). For questions, please post to [Bioconductor support site](https://support.bioconductor.org/) and tag your post with *ChIPseeker*. For Chinese user, you can follow me on [WeChat (微信)](https://guangchuangyu.github.io/blog_images/biobabble.jpg). # Session Information Here is the output of `sessionInfo()` on the system on which this document was compiled: ```{r echo=FALSE} sessionInfo() ``` # References ================================================ FILE: vignettes/ChIPseeker.bib ================================================ @article{yu_reactomepa_2016, title = {{ReactomePA}: an R/Bioconductor package for reactome pathway analysis and visualization}, volume = {12}, issn = {1742-2051}, url = {http://pubs.rsc.org.eproxy2.lib.hku.hk/en/content/articlelanding/2016/mb/c5mb00663e}, doi = {10.1039/C5MB00663E}, shorttitle = {{ReactomePA}}, abstract = {Reactome is a manually curated pathway annotation database for unveiling high-order biological pathways from high-throughput data. {ReactomePA} is an R/Bioconductor package providing enrichment analyses, including hypergeometric test and gene set enrichment analyses. A functional analysis can be applied to the genomic coordination obtained from a sequencing experiment to analyze the functional significance of genomic loci including cis-regulatory elements and non-coding regions. Comparison among different experiments is also supported. Moreover, {ReactomePA} provides several visualization functions to produce highly customizable, publication-quality figures. The source code and documents of {ReactomePA} are freely available through Bioconductor (http://www.bioconductor.org/packages/{ReactomePA}).}, pages = {477--479}, number = {2}, journaltitle = {Molecular {BioSystems}}, shortjournal = {Mol. {BioSyst}.}, author = {Yu, Guangchuang and He, Qing-Yu}, urldate = {2016-02-17}, date = {2016-01-26}, langid = {english} } @article{yu_chipseeker_2015, title = "ChIPseeker: an R/Bioconductor package for ChIP peak annotation, comparison and visualization", author = {Yu, Guangchuang and Wang, Li-Gen and He, Qing-Yu}, journal = "Bioinformatics", year = "2015", volume = "31", number = "14", pages = "2382-2383", PMID = "25765347", url = {http://bioinformatics.oxfordjournals.org/content/31/14/2382.abstract}, doi = "10.1093/bioinformatics/btv145", } @article{yu_dose_2015, title = {{DOSE}: an R/Bioconductor package for disease ontology semantic and enrichment analysis}, volume = {31}, issn = {1367-4803, 1460-2059}, url = {http://bioinformatics.oxfordjournals.org/content/31/4/608}, doi = {10.1093/bioinformatics/btu684}, shorttitle = {{DOSE}}, abstract = {Summary: Disease ontology ({DO}) annotates human genes in the context of disease. {DO} is important annotation in translating molecular findings from high-throughput data to clinical relevance. {DOSE} is an R package providing semantic similarity computations among {DO} terms and genes which allows biologists to explore the similarities of diseases and of gene functions in disease perspective. Enrichment analyses including hypergeometric model and gene set enrichment analysis are also implemented to support discovering disease associations of high-throughput biological data. This allows biologists to verify disease relevance in a biological experiment and identify unexpected disease associations. Comparison among gene clusters is also supported. Availability and implementation: {DOSE} is released under Artistic-2.0 License. The source code and documents are freely available through Bioconductor (http://www.bioconductor.org/packages/release/bioc/html/{DOSE}.html). Supplementary information: Supplementary data are available at Bioinformatics online. Contact: gcyu@connect.hku.hk or tqyhe@jnu.edu.cn}, pages = {608--609}, number = {4}, journaltitle = {Bioinformatics}, shortjournal = {Bioinformatics}, author = {Yu, Guangchuang and Wang, Li-Gen and Yan, Guang-Rong and He, Qing-Yu}, urldate = {2015-02-13}, date = {2015-02-15}, langid = {english} } @article{urbanucci_overexpression_2012, title = {Overexpression of androgen receptor enhances the binding of the receptor to the chromatin in prostate cancer}, volume = {31}, issn = {1476-5594}, doi = {10.1038/onc.2011.401}, abstract = {Androgen receptor ({AR)} is overexpressed in the majority of castration-resistant prostate cancers ({CRPCs).} Our goal was to study the effect of {AR} overexpression on the chromatin binding of the receptor and to identify {AR} target genes that may be important in the emergence of {CRPC.} We have established two sublines of {LNCaP} prostate cancer ({PC)} cell line, one overexpressing {AR} 2-3-fold and the other 4-5-fold compared with the control cells. We used chromatin immunoprecipitation ({ChIP)} and deep-sequencing (seq) to identify {AR-binding} sites ({ARBSs).} We found that the number of {ARBSs} and the {AR-binding} strength were positively associated with the level of {AR} when cells were stimulated with low concentrations of androgens. In cells overexpressing {AR}, the chromatin binding of the receptor took place in 100-fold lower concentration of the ligand than in control cells. We confirmed the association of {AR} level and chromatin binding in two {PC} xenografts, one containing {AR} gene amplification with high {AR} expression, and the other with low expression. By combining the {ChIP-seq} and expression profiling, we identified {AR} target genes that are upregulated in {PC.} Of them, the expression of {ZWINT}, {SKP2} (S-phase kinase-associated protein 2 (p45)) and {FEN1} (flap structure-specific endonuclease 1) was demonstrated to be increased in {CRPC}, while the expression of {SNAI2} was decreased in both {PC} and {CRPC.} {FEN1} protein expression was also associated with poor prognosis in prostatectomy-treated patients. Finally, the knock-down of {FEN1} with small interfering {RNA} inhibited the growth of {LNCaP} cells. Our data demonstrate that the overexpression of {AR} sensitizes the receptor binding to chromatin, thus, explaining how {AR} signaling pathway is reactivated in {CRPC} cells.}, pages = {2153-2163}, number = {17}, journaltitle = {Oncogene}, shortjournal = {Oncogene}, author = {Urbanucci, A and Sahu, B and Seppälä, J and Larjo, A and Latonen, L M and Waltering, K K and Tammela, T L J and Vessella, R L and Lähdesmäki, H and Jänne, O A and Visakorpi, T}, date = {2012-04-26}, note = {{PMID:} 21909140}, keywords = {Animals, Binding Sites, Cell Line, Tumor, Chromatin, Flap Endonucleases, Gene Amplification, Gene Expression Profiling, Humans, Intracellular Signaling Peptides and Proteins, Male, Mice, Nuclear Proteins, Nucleic Acid Amplification Techniques, Prostatic Neoplasms, Receptors, Androgen, S-Phase Kinase-Associated Proteins, Transplantation, Heterologous} } @article{pemberton_genome-wide_2014, title = {Genome-wide co-localization of Polycomb orthologs and their effects on gene expression in human fibroblasts}, volume = {15}, issn = {1465-6914}, doi = {10.1186/gb-2014-15-2-r23}, abstract = {{BACKGROUND:} Polycomb group proteins form multicomponent complexes that are important for establishing lineage-specific patterns of gene expression. Mammalian cells encode multiple permutations of the prototypic Polycomb repressive complex 1 ({PRC1)} with little evidence for functional specialization. An aim of this study is to determine whether the multiple orthologs that are co-expressed in human fibroblasts act on different target genes and whether their genomic location changes during cellular senescence. {RESULTS:} Deep sequencing of chromatin immunoprecipitated with antibodies against {CBX6}, {CBX7}, {CBX8}, {RING1} and {RING2} reveals that the orthologs co-localize at multiple sites. {PCR-based} validation at representative loci suggests that a further six {PRC1} proteins have similar binding patterns. Importantly, sequential chromatin immunoprecipitation with antibodies against different orthologs implies that multiple variants of {PRC1} associate with the same {DNA.} At many loci, the binding profiles have a distinctive architecture that is preserved in two different types of fibroblast. Conversely, there are several hundred loci at which {PRC1} binding is cell type-specific and, contrary to expectations, the presence of {PRC1} does not necessarily equate with transcriptional silencing. Interestingly, the {PRC1} binding profiles are preserved in senescent cells despite changes in gene expression. {CONCLUSIONS:} The multiple permutations of {PRC1} in human fibroblasts congregate at common rather than specific sites in the genome and with overlapping but distinctive binding profiles in different fibroblasts. The data imply that the effects of {PRC1} complexes on gene expression are more subtle than simply repressing the loci at which they bind.}, pages = {R23}, number = {2}, journaltitle = {Genome biology}, shortjournal = {Genome Biol.}, author = {Pemberton, Helen and Anderton, Emma and Patel, Harshil and Brookes, Sharon and Chandler, Hollie and Palermo, Richard and Stock, Julie and Rodriguez-Niedenführ, Marc and Racek, Tomas and de Breed, Lucas and Stewart, Aengus and Matthews, Nik and Peters, Gordon}, date = {2014-02-03}, note = {{PMID:} 24485159} } @article{yu_clusterprofiler_2012, title = {{clusterProfiler:} an R Package for Comparing Biological Themes Among Gene Clusters}, volume = {16}, issn = {1536-2310, 1557-8100}, shorttitle = {{clusterProfiler}}, url = {http://online.liebertpub.com/doi/abs/10.1089/omi.2011.0118}, doi = {10.1089/omi.2011.0118}, number = {5}, urldate = {2012-05-05}, journal = {{OMICS:} A Journal of Integrative Biology}, author = {Yu, Guangchuang and Wang, Li-Gen and Han, Yanyan and He, Qing-Yu}, month = may, year = {2012}, pages = {284--287}, } @article{ashburner_gene_2000, title = {Gene Ontology: tool for the unification of biology}, volume = {25}, issn = {1061-4036}, url = {http://dx.doi.org/10.1038/75556}, doi = {10.1038/75556}, shorttitle = {Gene Ontology}, issue = {1}, pages = {25-29}, journaltitle = {Nat Genet}, shortjournal = {Nat Genet}, author = {Ashburner, Michael and Ball, Catherine A. and Blake, Judith A. and Botstein, David and Butler, Heather and Cherry, J. Michael and Davis, Allan P. and Dolinski, Kara and Dwight, Selina S. and Eppig, Janan T. and Harris, Midori A. and Hill, David P. and Issel-Tarver, Laurie and Kasarskis, Andrew and Lewis, Suzanna and Matese, John C. and Richardson, Joel E. and Ringwald, Martin and Rubin, Gerald M. and Sherlock, Gavin}, urldate = {2010-04-13}, date = {2000-05}, } @article{kanehisa_kegg_2004, title = {The {KEGG} resource for deciphering the genome}, volume = {32}, issn = {0305-1048, 1362-4962}, url = {http://nar.oxfordjournals.org/content/32/suppl_1/D277}, doi = {10.1093/nar/gkh063}, language = {en}, issue = {suppl 1}, pages = {D277-D280}, journaltitle = {Nucleic Acids Research}, shortjournal = {Nucl. Acids Res.}, author = {Kanehisa, Minoru and Goto, Susumu and Kawashima, Shuichi and Okuno, Yasushi and Hattori, Masahiro}, urldate = {2013-10-15}, date = {2004}, note = {{PMID:} 14681412}, } @article{schriml_disease_2011, title = {Disease Ontology: a backbone for disease semantic integration}, volume = {40}, issn = {0305-1048, 1362-4962}, url = {http://nar.oxfordjournals.org/content/40/D1/D940.long}, doi = {10.1093/nar/gkr972}, shorttitle = {Disease Ontology}, issue = {D1}, pages = {D940-D946}, journaltitle = {Nucleic Acids Research}, author = {Schriml, L. M. and Arze, C. and Nadendla, S. and Chang, Y.-W. W. and Mazaitis, M. and Felix, V. and Feng, G. and Kibbe, W. A.}, urldate = {2012-03-01}, date = {2011-11-12}, } @article{croft_reactome_2013, title = {The Reactome pathway knowledgebase}, volume = {42}, issn = {0305-1048, 1362-4962}, url = {http://nar.oxfordjournals.org/content/42/D1/D472.long}, doi = {10.1093/nar/gkt1102}, issue = {D1}, pages = {D472-D477}, journaltitle = {Nucleic Acids Research}, author = {Croft, D. and Mundo, A. F. and Haw, R. and Milacic, M. and Weiser, J. and Wu, G. and Caudy, M. and Garapati, P. and Gillespie, M. and Kamdar, M. R. and Jassal, B. and Jupe, S. and Matthews, L. and May, B. and Palatnik, S. and Rothfels, K. and Shamovsky, V. and Song, H. and Williams, M. and Birney, E. and Hermjakob, H. and Stein, L. and {D'Eustachio}, P.}, urldate = {2014-02-18}, date = {2013-11-15}, } @article{ramirez2016deeptools2, title={deepTools2: a next generation web server for deep-sequencing data analysis}, author={Ram{\'\i}rez, Fidel and Ryan, Devon P and Gr{\"u}ning, Bj{\"o}rn and Bhardwaj, Vivek and Kilpert, Fabian and Richter, Andreas S and Heyne, Steffen and D{\"u}ndar, Friederike and Manke, Thomas}, journal={Nucleic acids research}, volume={44}, number={W1}, pages={W160--W165}, year={2016}, publisher={Oxford University Press} }