Repository: ncic-sugon/SparkSeq
Branch: master
Commit: 45dcc57dc74f
Files: 523
Total size: 114.2 MB
Directory structure:
gitextract_qafxk5fp/
├── .gitignore
├── LICENSE
├── README.md
├── bin/
│ ├── config.properties
│ └── runSparkWGS.sh
├── pom.xml
└── src/
├── main/
│ ├── java/
│ │ ├── com/
│ │ │ └── github/
│ │ │ └── lindenb/
│ │ │ └── jbwa/
│ │ │ └── jni/
│ │ │ ├── AlnRgn.java
│ │ │ ├── BwaFrame.java
│ │ │ ├── BwaIndex.java
│ │ │ ├── BwaMem.java
│ │ │ ├── KSeq.java
│ │ │ └── ShortRead.java
│ │ └── org/
│ │ └── ncic/
│ │ └── bioinfo/
│ │ └── sparkseq/
│ │ ├── algorithms/
│ │ │ ├── adapters/
│ │ │ │ ├── ApplyBQSRAdaptor.java
│ │ │ │ ├── BQSRTableGather.java
│ │ │ │ ├── BaseRecalibratorAdapter.java
│ │ │ │ ├── BwaMemAdapter.java
│ │ │ │ ├── HaplotypeCallerAdapter.java
│ │ │ │ ├── IndelRealignAdapter.java
│ │ │ │ └── MutectAdapter.java
│ │ │ ├── data/
│ │ │ │ ├── basic/
│ │ │ │ │ ├── CountSet.java
│ │ │ │ │ ├── DefaultHashMap.java
│ │ │ │ │ ├── IndexedSet.java
│ │ │ │ │ ├── IntMaxHeap.java
│ │ │ │ │ ├── NestedIntegerArray.java
│ │ │ │ │ ├── Pair.java
│ │ │ │ │ ├── Permutation.java
│ │ │ │ │ └── PrimitivePair.java
│ │ │ │ ├── reference/
│ │ │ │ │ ├── RefContentProvider.java
│ │ │ │ │ ├── RefMetaDataTracker.java
│ │ │ │ │ └── ReferenceContext.java
│ │ │ │ ├── sam/
│ │ │ │ │ ├── AlignmentContext.java
│ │ │ │ │ ├── AlignmentStateMachine.java
│ │ │ │ │ ├── GATKSAMReadGroupRecord.java
│ │ │ │ │ ├── GATKSAMRecord.java
│ │ │ │ │ ├── IntervalLocusSamTraverser.java
│ │ │ │ │ ├── LocusSamTraverser.java
│ │ │ │ │ ├── MergingPileupElementIterator.java
│ │ │ │ │ ├── PileupElement.java
│ │ │ │ │ ├── PileupElementTracker.java
│ │ │ │ │ ├── ReadBackedPileup.java
│ │ │ │ │ ├── ReadBackedPileupImpl.java
│ │ │ │ │ ├── ReadSamTraverser.java
│ │ │ │ │ ├── RegionSamTraverser.java
│ │ │ │ │ ├── SamContentProvider.java
│ │ │ │ │ └── filter/
│ │ │ │ │ ├── BadCigarFilter.java
│ │ │ │ │ ├── BadMateFilter.java
│ │ │ │ │ ├── DuplicateReadFilter.java
│ │ │ │ │ ├── FailsVendorQualityCheckFilter.java
│ │ │ │ │ ├── Filter.java
│ │ │ │ │ ├── FilterUtils.java
│ │ │ │ │ ├── HCMappingQualityFilter.java
│ │ │ │ │ ├── InOriginIntervalFilter.java
│ │ │ │ │ ├── MappingQualityUnavailableFilter.java
│ │ │ │ │ ├── MappingQualityZeroFilter.java
│ │ │ │ │ ├── NegativeAlignmentStartFilter.java
│ │ │ │ │ ├── NotPrimaryAlignmentFilter.java
│ │ │ │ │ ├── SupplementaryReadFilter.java
│ │ │ │ │ └── UnmappedReadFilter.java
│ │ │ │ └── vcf/
│ │ │ │ ├── GATKFeature.java
│ │ │ │ ├── HomoSapiensConstants.java
│ │ │ │ ├── RODContentProvider.java
│ │ │ │ ├── RODNames.java
│ │ │ │ ├── RODRecordList.java
│ │ │ │ ├── RODRecordListImpl.java
│ │ │ │ ├── RODTraverser.java
│ │ │ │ ├── RefMetaTrackerTraverser.java
│ │ │ │ ├── RodBinding.java
│ │ │ │ ├── Tags.java
│ │ │ │ ├── VCFHeaderLineIterable.java
│ │ │ │ └── header/
│ │ │ │ ├── StandardWESVCFHeader.java
│ │ │ │ └── StandardWGSVCFHeader.java
│ │ │ ├── engine/
│ │ │ │ ├── ActiveRegionWalker.java
│ │ │ │ ├── IntervalLocusWalker.java
│ │ │ │ ├── LocusWalker.java
│ │ │ │ ├── RODWalker.java
│ │ │ │ ├── ReadWalker.java
│ │ │ │ └── Walker.java
│ │ │ ├── utils/
│ │ │ │ ├── AlignmentContextUtils.java
│ │ │ │ ├── AlignmentUtils.java
│ │ │ │ ├── AlleleListUtils.java
│ │ │ │ ├── ArtificialSAMUtils.java
│ │ │ │ ├── BaseUtils.java
│ │ │ │ ├── CGAAlignmentUtils.java
│ │ │ │ ├── CigarUtils.java
│ │ │ │ ├── DeprecatedToolChecks.java
│ │ │ │ ├── DocumentedGATKFeature.java
│ │ │ │ ├── EventType.java
│ │ │ │ ├── FragmentUtils.java
│ │ │ │ ├── GATKVariantContextUtils.java
│ │ │ │ ├── GenomeLoc.java
│ │ │ │ ├── GenomeLocParser.java
│ │ │ │ ├── GenomeLocSortedSet.java
│ │ │ │ ├── GenotypingGivenAllelesUtils.java
│ │ │ │ ├── HasGenomeLocation.java
│ │ │ │ ├── HelpConstants.java
│ │ │ │ ├── LRUCache.java
│ │ │ │ ├── MRUCachingSAMSequenceDictionary.java
│ │ │ │ ├── MannWhitneyU.java
│ │ │ │ ├── MathUtils.java
│ │ │ │ ├── NGSPlatform.java
│ │ │ │ ├── QualityUtils.java
│ │ │ │ ├── RandomGenerator.java
│ │ │ │ ├── ReadUtils.java
│ │ │ │ ├── RecalUtils.java
│ │ │ │ ├── SampleListUtils.java
│ │ │ │ ├── SequenceComplexity.java
│ │ │ │ ├── TextFormattingUtils.java
│ │ │ │ ├── Utils.java
│ │ │ │ ├── clip/
│ │ │ │ │ ├── ClippingOp.java
│ │ │ │ │ ├── ClippingRepresentation.java
│ │ │ │ │ └── ReadClipper.java
│ │ │ │ ├── downsampling/
│ │ │ │ │ ├── AlleleBiasedDownsamplingUtils.java
│ │ │ │ │ ├── Downsampler.java
│ │ │ │ │ ├── DownsamplingUtils.java
│ │ │ │ │ ├── LIBSDownsamplingInfo.java
│ │ │ │ │ └── LevelingDownsampler.java
│ │ │ │ ├── fragments/
│ │ │ │ │ └── FragmentCollection.java
│ │ │ │ ├── haplotype/
│ │ │ │ │ ├── EventMap.java
│ │ │ │ │ └── Haplotype.java
│ │ │ │ ├── interval/
│ │ │ │ │ ├── IntervalMergingRule.java
│ │ │ │ │ ├── IntervalSetRule.java
│ │ │ │ │ └── IntervalUtils.java
│ │ │ │ ├── pairhmm/
│ │ │ │ │ ├── ArrayLoglessPairHMM.java
│ │ │ │ │ ├── Log10PairHMM.java
│ │ │ │ │ ├── LoglessPairHMM.java
│ │ │ │ │ ├── N2MemoryPairHMM.java
│ │ │ │ │ ├── PairHMM.java
│ │ │ │ │ ├── PairHMMIndelErrorModel.java
│ │ │ │ │ └── PairHMMModel.java
│ │ │ │ ├── reports/
│ │ │ │ │ ├── GATKReport.java
│ │ │ │ │ ├── GATKReportColumn.java
│ │ │ │ │ ├── GATKReportColumnFormat.java
│ │ │ │ │ ├── GATKReportDataType.java
│ │ │ │ │ ├── GATKReportTable.java
│ │ │ │ │ └── GATKReportVersion.java
│ │ │ │ ├── smithwaterman/
│ │ │ │ │ ├── GlobalEdgeGreedySWPairwiseAlignment.java
│ │ │ │ │ ├── Parameters.java
│ │ │ │ │ ├── SWPairwiseAlignment.java
│ │ │ │ │ ├── SWParameterSet.java
│ │ │ │ │ └── SmithWaterman.java
│ │ │ │ ├── transformers/
│ │ │ │ │ ├── BQSRReadTransformer.java
│ │ │ │ │ ├── MisencodedBaseQualityReadTransformer.java
│ │ │ │ │ └── ReadTransformer.java
│ │ │ │ └── vcfWriter/
│ │ │ │ ├── GVCFWriter.java
│ │ │ │ └── HomRefBlock.java
│ │ │ └── walker/
│ │ │ ├── SerializableActiveRegionMapData.java
│ │ │ ├── baserecalibrator/
│ │ │ │ ├── BAQ.java
│ │ │ │ ├── BaseRecalibrator.java
│ │ │ │ ├── QualQuantizer.java
│ │ │ │ ├── QuantizationInfo.java
│ │ │ │ ├── ReadCovariates.java
│ │ │ │ ├── ReadRecalibrationInfo.java
│ │ │ │ ├── RecalDatum.java
│ │ │ │ ├── RecalibrationArgumentCollection.java
│ │ │ │ ├── RecalibrationEngine.java
│ │ │ │ ├── RecalibrationTables.java
│ │ │ │ └── covariate/
│ │ │ │ ├── ContextCovariate.java
│ │ │ │ ├── Covariate.java
│ │ │ │ ├── CycleCovariate.java
│ │ │ │ ├── ExperimentalCovariate.java
│ │ │ │ ├── QualityScoreCovariate.java
│ │ │ │ ├── ReadGroupCovariate.java
│ │ │ │ ├── RepeatCovariate.java
│ │ │ │ ├── RepeatLengthCovariate.java
│ │ │ │ ├── RepeatUnitAndLengthCovariate.java
│ │ │ │ ├── RepeatUnitCovariate.java
│ │ │ │ ├── RequiredCovariate.java
│ │ │ │ └── StandardCovariate.java
│ │ │ ├── genotypegvcfs/
│ │ │ │ ├── GenotypeGVCFs.java
│ │ │ │ └── ReferenceConfidenceVariantContextMerger.java
│ │ │ ├── haplotypecaller/
│ │ │ │ ├── ActiveRegion.java
│ │ │ │ ├── ActiveRegionFinder.java
│ │ │ │ ├── ActiveRegionMapData.java
│ │ │ │ ├── ActiveRegionTrimmer.java
│ │ │ │ ├── ActivityProfile.java
│ │ │ │ ├── ActivityProfileState.java
│ │ │ │ ├── AlleleList.java
│ │ │ │ ├── AlleleListPermutation.java
│ │ │ │ ├── AssemblyResult.java
│ │ │ │ ├── AssemblyResultSet.java
│ │ │ │ ├── BandPassActivityProfile.java
│ │ │ │ ├── ConsensusAlleleCounter.java
│ │ │ │ ├── DiploidGenotype.java
│ │ │ │ ├── DiploidSNPGenotypeLikelihoods.java
│ │ │ │ ├── ExactACcounts.java
│ │ │ │ ├── GeneralPloidyGenotypeLikelihoods.java
│ │ │ │ ├── GenotypeAlleleCounts.java
│ │ │ │ ├── GenotypeLikelihoodCalculator.java
│ │ │ │ ├── GenotypeLikelihoodCalculators.java
│ │ │ │ ├── GenotypingData.java
│ │ │ │ ├── GenotypingLikelihoods.java
│ │ │ │ ├── GenotypingOutputMode.java
│ │ │ │ ├── HaplotypeCaller.java
│ │ │ │ ├── HaplotypeSizeAndBaseComparator.java
│ │ │ │ ├── HeterogeneousKmerSizeResolution.java
│ │ │ │ ├── IndexedAlleleList.java
│ │ │ │ ├── IndexedSampleList.java
│ │ │ │ ├── MostLikelyAllele.java
│ │ │ │ ├── OutputMode.java
│ │ │ │ ├── PerReadAlleleLikelihoodMap.java
│ │ │ │ ├── PloidyModel.java
│ │ │ │ ├── ReadErrorCorrector.java
│ │ │ │ ├── SampleList.java
│ │ │ │ ├── afcalculate/
│ │ │ │ │ ├── AFCalculationResult.java
│ │ │ │ │ ├── AFCalculator.java
│ │ │ │ │ ├── AFCalculatorImplementation.java
│ │ │ │ │ ├── AFCalculatorProvider.java
│ │ │ │ │ ├── DiploidExactAFCalculator.java
│ │ │ │ │ ├── ExactACset.java
│ │ │ │ │ ├── ExactAFCalculator.java
│ │ │ │ │ ├── FixedAFCalculatorProvider.java
│ │ │ │ │ ├── GeneralPloidyExactAFCalculator.java
│ │ │ │ │ ├── GeneralPloidyFailOverAFCalculatorProvider.java
│ │ │ │ │ ├── IndependentAllelesDiploidExactAFCalculator.java
│ │ │ │ │ ├── OriginalDiploidExactAFCalculator.java
│ │ │ │ │ ├── ReferenceDiploidExactAFCalculator.java
│ │ │ │ │ └── StateTracker.java
│ │ │ │ ├── annotator/
│ │ │ │ │ ├── AlleleBalance.java
│ │ │ │ │ ├── AlleleBalanceBySample.java
│ │ │ │ │ ├── AnnotationInterfaceManager.java
│ │ │ │ │ ├── BaseCounts.java
│ │ │ │ │ ├── BaseQualityRankSumTest.java
│ │ │ │ │ ├── ChromosomeCountConstants.java
│ │ │ │ │ ├── ChromosomeCounts.java
│ │ │ │ │ ├── ClippingRankSumTest.java
│ │ │ │ │ ├── Coverage.java
│ │ │ │ │ ├── DepthPerAlleleBySample.java
│ │ │ │ │ ├── DepthPerSampleHC.java
│ │ │ │ │ ├── FisherStrand.java
│ │ │ │ │ ├── GenotypeSummaries.java
│ │ │ │ │ ├── HaplotypeScore.java
│ │ │ │ │ ├── InbreedingCoeff.java
│ │ │ │ │ ├── LowMQ.java
│ │ │ │ │ ├── MappingQualityRankSumTest.java
│ │ │ │ │ ├── MappingQualityZero.java
│ │ │ │ │ ├── MappingQualityZeroBySample.java
│ │ │ │ │ ├── NBaseCount.java
│ │ │ │ │ ├── QualByDepth.java
│ │ │ │ │ ├── RMSMappingQuality.java
│ │ │ │ │ ├── RankSumTest.java
│ │ │ │ │ ├── ReadPosRankSumTest.java
│ │ │ │ │ ├── SpanningDeletions.java
│ │ │ │ │ ├── StrandBiasBySample.java
│ │ │ │ │ ├── StrandBiasTest.java
│ │ │ │ │ ├── StrandOddsRatio.java
│ │ │ │ │ ├── TandemRepeatAnnotator.java
│ │ │ │ │ ├── VariantAnnotatorEngine.java
│ │ │ │ │ ├── VariantOverlapAnnotator.java
│ │ │ │ │ └── interfaces/
│ │ │ │ │ ├── ActiveRegionBasedAnnotation.java
│ │ │ │ │ ├── AnnotationType.java
│ │ │ │ │ ├── AnnotatorCompatible.java
│ │ │ │ │ ├── ExperimentalAnnotation.java
│ │ │ │ │ ├── GenotypeAnnotation.java
│ │ │ │ │ ├── InfoFieldAnnotation.java
│ │ │ │ │ ├── RodRequiringAnnotation.java
│ │ │ │ │ ├── StandardAnnotation.java
│ │ │ │ │ ├── VariantAnnotatorAnnotation.java
│ │ │ │ │ └── WorkInProgressAnnotation.java
│ │ │ │ ├── argcollection/
│ │ │ │ │ ├── DbsnpArgumentCollection.java
│ │ │ │ │ ├── GenotypeCalculationArgumentCollection.java
│ │ │ │ │ ├── HaplotypeCallerArgumentCollection.java
│ │ │ │ │ ├── StandardCallerArgumentCollection.java
│ │ │ │ │ └── UnifiedArgumentCollection.java
│ │ │ │ ├── genotyper/
│ │ │ │ │ ├── AFPriorProvider.java
│ │ │ │ │ ├── AlleleLikelihoodMatrixMapper.java
│ │ │ │ │ ├── CustomAFPriorProvider.java
│ │ │ │ │ ├── GenotypingEngine.java
│ │ │ │ │ ├── HaplotypeCallerGenotypingEngine.java
│ │ │ │ │ ├── HaplotypeLDCalculator.java
│ │ │ │ │ ├── HeterozygosityAFPriorProvider.java
│ │ │ │ │ ├── InfiniteRandomMatingPopulationModel.java
│ │ │ │ │ ├── LDMerger.java
│ │ │ │ │ ├── MergeVariantsAcrossHaplotypes.java
│ │ │ │ │ ├── UnifiedGenotypingEngine.java
│ │ │ │ │ └── VariantCallContext.java
│ │ │ │ ├── graphs/
│ │ │ │ │ ├── AggregatedSubHaplotypeFinder.java
│ │ │ │ │ ├── BaseEdge.java
│ │ │ │ │ ├── BaseGraph.java
│ │ │ │ │ ├── BaseGraphIterator.java
│ │ │ │ │ ├── BaseVertex.java
│ │ │ │ │ ├── CommonSuffixSplitter.java
│ │ │ │ │ ├── DeBruijnVertex.java
│ │ │ │ │ ├── DeadEndKBestSubHaplotypeFinder.java
│ │ │ │ │ ├── EmptyPathHaplotypeFinderNode.java
│ │ │ │ │ ├── GraphUtils.java
│ │ │ │ │ ├── KBestHaplotype.java
│ │ │ │ │ ├── KBestHaplotypeFinder.java
│ │ │ │ │ ├── KBestSubHaplotypeFinder.java
│ │ │ │ │ ├── KMerCounter.java
│ │ │ │ │ ├── Kmer.java
│ │ │ │ │ ├── KmerSearchableGraph.java
│ │ │ │ │ ├── KmerSequence.java
│ │ │ │ │ ├── LowWeightChainPruner.java
│ │ │ │ │ ├── MultiSampleEdge.java
│ │ │ │ │ ├── Path.java
│ │ │ │ │ ├── RecursiveSubHaplotypeFinder.java
│ │ │ │ │ ├── Route.java
│ │ │ │ │ ├── RouteFinder.java
│ │ │ │ │ ├── SeqGraph.java
│ │ │ │ │ ├── SeqVertex.java
│ │ │ │ │ ├── SharedSequenceMerger.java
│ │ │ │ │ ├── SharedVertexSequenceSplitter.java
│ │ │ │ │ ├── TestGraph.java
│ │ │ │ │ └── VertexOrder.java
│ │ │ │ ├── model/
│ │ │ │ │ ├── ErrorModel.java
│ │ │ │ │ ├── GeneralPloidyGenotypeLikelihoodsCalculationModel.java
│ │ │ │ │ ├── GeneralPloidyIndelGenotypeLikelihoods.java
│ │ │ │ │ ├── GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java
│ │ │ │ │ ├── GenotypeLikelihoodsCalculationModel.java
│ │ │ │ │ ├── GenotypingModel.java
│ │ │ │ │ ├── HomogeneousPloidyModel.java
│ │ │ │ │ ├── IndelGenotypeLikelihoodsCalculationModel.java
│ │ │ │ │ ├── ProbabilityVector.java
│ │ │ │ │ ├── RefVsAnyResult.java
│ │ │ │ │ ├── ReferenceConfidenceMode.java
│ │ │ │ │ ├── ReferenceConfidenceModel.java
│ │ │ │ │ └── SNPGenotypeLikelihoodsCalculationModel.java
│ │ │ │ ├── readlikelihood/
│ │ │ │ │ ├── PairHMMLikelihoodCalculationEngine.java
│ │ │ │ │ ├── ReadLikelihoodCalculationEngine.java
│ │ │ │ │ ├── ReadLikelihoods.java
│ │ │ │ │ └── ReadLikelihoodsArrayCache.java
│ │ │ │ └── readthreading/
│ │ │ │ ├── DanglingChainMergingGraph.java
│ │ │ │ ├── HaplotypeGraph.java
│ │ │ │ ├── HaplotypeRoute.java
│ │ │ │ ├── LocalAssemblyEngine.java
│ │ │ │ ├── MultiDeBruijnVertex.java
│ │ │ │ ├── ReadThreadingAssembler.java
│ │ │ │ ├── ReadThreadingGraph.java
│ │ │ │ └── SequenceForKmers.java
│ │ │ ├── indelrealigner/
│ │ │ │ ├── ConsensusDeterminationModel.java
│ │ │ │ ├── ConstrainedMateFixingManager.java
│ │ │ │ ├── IndelRealigner.java
│ │ │ │ └── ReadBin.java
│ │ │ ├── mutect/
│ │ │ │ ├── AbstractPowerCalculator.java
│ │ │ │ ├── CallStatsGenerator.java
│ │ │ │ ├── CandidateMutation.java
│ │ │ │ ├── LocusReadPile.java
│ │ │ │ ├── MuTectArgumentCollection.java
│ │ │ │ ├── MuTectStats.java
│ │ │ │ ├── Mutect.java
│ │ │ │ ├── NormalPowerCalculator.java
│ │ │ │ ├── QualitySums.java
│ │ │ │ ├── SequenceUtils.java
│ │ │ │ ├── TumorPowerCalculator.java
│ │ │ │ ├── VCFGenerator.java
│ │ │ │ └── VariableAllelicRatioGenotypeLikelihoods.java
│ │ │ ├── printreads/
│ │ │ │ ├── BaseRecalibration.java
│ │ │ │ ├── PrintReads.java
│ │ │ │ └── RecalibrationReport.java
│ │ │ └── realignertargetcreator/
│ │ │ ├── Event.java
│ │ │ ├── EventPair.java
│ │ │ ├── EventType.java
│ │ │ └── RealignerTargetCreator.java
│ │ ├── compress/
│ │ │ ├── BaseCompressTools.java
│ │ │ ├── QualityCompressTools.java
│ │ │ └── huffman/
│ │ │ ├── AdaptiveHuffmanCompress.java
│ │ │ ├── AdaptiveHuffmanDecompress.java
│ │ │ ├── BitInputStream.java
│ │ │ ├── BitOutputStream.java
│ │ │ ├── CanonicalCode.java
│ │ │ ├── CodeTree.java
│ │ │ ├── FrequencyTable.java
│ │ │ ├── HuffmanCompress.java
│ │ │ ├── HuffmanDecoder.java
│ │ │ ├── HuffmanDecompress.java
│ │ │ ├── HuffmanEncoder.java
│ │ │ ├── InternalNode.java
│ │ │ ├── Leaf.java
│ │ │ └── Node.java
│ │ ├── debug/
│ │ │ └── DebugWriter.java
│ │ ├── exceptions/
│ │ │ ├── CompressException.java
│ │ │ ├── GATKException.java
│ │ │ ├── ReviewedGATKException.java
│ │ │ └── UserException.java
│ │ ├── fileio/
│ │ │ └── format/
│ │ │ └── SingleFastqInputFormat.java
│ │ └── transfer/
│ │ ├── Basic2SAMRecordTransfer.java
│ │ ├── CollectionConverter.java
│ │ ├── FastqRecord2ShortReadTransfer.java
│ │ ├── GATKReportTransfer.java
│ │ ├── SAMHeaderTransfer.java
│ │ ├── SAMReadGroupRecordTransfer.java
│ │ ├── SAMRecord2BasicTransfer.java
│ │ ├── SAMSequenceDictTransfer.java
│ │ ├── VC2VcfRecordTransfer.java
│ │ └── VcfRecord2VCTransfer.java
│ └── scala/
│ └── org/
│ └── ncic/
│ └── bioinfo/
│ └── sparkseq/
│ ├── WGSPipeline.scala
│ ├── const/
│ │ ├── BinTools.scala
│ │ ├── PipelineConst.scala
│ │ ├── ResourceKeys.scala
│ │ └── SamRecordConst.scala
│ ├── data/
│ │ ├── basic/
│ │ │ ├── BasicSamRecord.scala
│ │ │ ├── FastqPairRecord.scala
│ │ │ ├── FastqRecord.scala
│ │ │ └── VcfRecord.scala
│ │ ├── bundle/
│ │ │ ├── FASTAPartitionBundle.scala
│ │ │ ├── FASTQBundle.scala
│ │ │ ├── FASTQPairBundle.scala
│ │ │ ├── RODBundle.scala
│ │ │ ├── RefPartitionInfoBundle.scala
│ │ │ ├── SAMBundle.scala
│ │ │ ├── VCFBundle.scala
│ │ │ └── VCFPartitionBundle.scala
│ │ ├── common/
│ │ │ ├── Flags.scala
│ │ │ ├── Locus.scala
│ │ │ ├── ReadGroupInfo.scala
│ │ │ ├── RefContigInfo.scala
│ │ │ ├── RefPartitionInfo.scala
│ │ │ ├── SamHeaderInfo.scala
│ │ │ └── VcfHeaderInfo.scala
│ │ └── partition/
│ │ ├── BundlePartition.scala
│ │ ├── FastaPartition.scala
│ │ ├── FastqPairRecordPartition.scala
│ │ ├── FastqRecordPartition.scala
│ │ ├── Partition.scala
│ │ ├── SamRecordPartition.scala
│ │ └── VcfRecordPartition.scala
│ ├── debug/
│ │ └── Dumper.scala
│ ├── engine/
│ │ ├── AbstractProcess.scala
│ │ ├── PartitionOptimizedProcess.scala
│ │ ├── PartitionOptimizer.scala
│ │ ├── Pipeline.scala
│ │ ├── Process.scala
│ │ └── Runnable.scala
│ ├── exceptions/
│ │ ├── IllegalInputException.scala
│ │ ├── PipelineException.scala
│ │ ├── ResourceException.scala
│ │ ├── ResourceNotSetException.scala
│ │ └── ResourceSetException.scala
│ ├── fileio/
│ │ ├── FileLoader.scala
│ │ ├── FileWriter.scala
│ │ ├── HDFSReader.scala
│ │ ├── NormalFileLoader.scala
│ │ └── NormalFileWriter.scala
│ ├── partitioner/
│ │ ├── FastaPartitioner.scala
│ │ ├── FastqPairPartitioner.scala
│ │ ├── FastqPartitioner.scala
│ │ ├── SamRecordPartitioner.scala
│ │ └── VcfPartitioner.scala
│ ├── processes/
│ │ ├── PartitionProcess.scala
│ │ ├── ReadRepartitioner.scala
│ │ ├── cleaning/
│ │ │ ├── BaseRecalibrationProcess.scala
│ │ │ ├── DataCleanProcess.scala
│ │ │ ├── IndelRealignProcess.scala
│ │ │ ├── MarkDuplicateProcess.scala
│ │ │ └── PartitionMarkDuplicateProcess.scala
│ │ ├── mapping/
│ │ │ ├── BwaMappingProcess.scala
│ │ │ └── JNIBwaMemProcess.scala
│ │ └── variantcalling/
│ │ ├── HaplotypeCallerProcess.scala
│ │ ├── MutectProcess.scala
│ │ └── VariantCallingProcess.scala
│ ├── resource/
│ │ ├── AbstractResource.scala
│ │ ├── Resource.scala
│ │ └── ResourcePool.scala
│ └── utils/
│ ├── FileUtils.scala
│ ├── FlagUtils.scala
│ └── StringUtils.scala
└── test/
├── java/
│ └── org/
│ └── ncic/
│ └── bioinfo/
│ └── sparkseq/
│ ├── algorithms/
│ │ ├── TestDebug.java
│ │ ├── adapters/
│ │ │ └── TestBQSRGather.java
│ │ ├── data/
│ │ │ └── sam/
│ │ │ ├── TestIntervalLocusSamTraverser.java
│ │ │ ├── TestLocusSamTraverser.java
│ │ │ ├── TestReadSamTraverser.java
│ │ │ ├── TestRegionSamTraverser.java
│ │ │ ├── TestSamContentProvider.java
│ │ │ └── filter/
│ │ │ └── TestFilterUtils.java
│ │ └── walker/
│ │ ├── AbstractTestCase.java
│ │ ├── TestActiveRegionFinder.java
│ │ ├── TestBaseRecalibrator.java
│ │ ├── TestGenotypeGVCFs.java
│ │ ├── TestHaplotypeCaller.java
│ │ ├── TestIndelRealigner.java
│ │ ├── TestMapDataSerialization.java
│ │ ├── TestMutect.java
│ │ ├── TestPrintReads.java
│ │ └── TestRealignerTargetCreator.java
│ ├── compress/
│ │ ├── TestBaseCompress.java
│ │ ├── TestCompressRate.java
│ │ ├── TestFastqCompress.java
│ │ ├── TestKryo.java
│ │ └── TestQualCompress.java
│ └── transfer/
│ ├── TestRecalTableTransfer.java
│ └── TestSAMRecordTransfer.java
├── resources/
│ ├── activeRegions.txt
│ ├── bqsrtables/
│ │ ├── 0_bqsr.table
│ │ ├── 1_bqsr.table
│ │ ├── 2_bqsr.table
│ │ ├── 3_bqsr.table
│ │ └── merged.table
│ ├── gvcf.vcf
│ ├── head_dbsnp.vcf
│ ├── human_g1k_v37.dict
│ ├── intervaled.sam
│ ├── littleFasta.dict
│ ├── littleFasta.fasta
│ ├── mills_indel.vcf
│ ├── mutect/
│ │ ├── 0_panel.intervals
│ │ ├── LOCUS.txt
│ │ ├── dedup_Blood.sam
│ │ ├── dedup_Tumor.sam
│ │ ├── head_dbsnp.vcf
│ │ ├── out.txt
│ │ ├── out.vcf
│ │ └── refContent
│ ├── raw.snps.indels.vcf
│ ├── realigned_reads.sam
│ ├── recal_data.table
│ ├── recal_reads.sam
│ ├── refheadContent
│ ├── target_interval.list
│ ├── test.bam
│ ├── test.sam
│ ├── test.vcf
│ ├── test1.fastq
│ └── test2.fastq
└── scala/
└── org/
└── ncic/
└── bioinfo/
└── sparkseq/
├── consts/
│ └── TestConst.scala
├── data/
│ ├── TestBasicSamRecord.scala
│ ├── TestFastaPartitioner.scala
│ ├── TestHeaderInfo.scala
│ ├── TestRefContigInfo.scala
│ ├── TestRefScatterInfo.scala
│ └── TestVcfRecord.scala
├── engine/
│ ├── TestEngine.scala
│ └── TestOptimizeEngine.scala
└── fileio/
└── TestNormalFileLoaderWriter.scala
================================================
FILE CONTENTS
================================================
================================================
FILE: .gitignore
================================================
.idea/*
target/*
test_result/*
bk/*
================================================
FILE: LICENSE
================================================
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "{}"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright 2016 ICT
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
================================================
FILE: README.md
================================================
SparkSeq
====
# Introduction
SparkSeq is a programming framework for large-scale genome data analysis, built on [Apache Spark](http://spark.incubator.apache.org/).
In the latest version, a WGS (whole-genome sequencing) pipeline is implemented on top of this framework.
# Get Started
## Installing Spark
A Spark release must be on your system before running SparkSeq.
SparkSeq defaults to Spark 2.1.0 and Hadoop 2.7.0. We have also tested the program on Spark 1.6.2 and Hadoop 2.6.4.
The latest release of Spark can be downloaded from the [Spark website](http://spark.apache.org/downloads.html). For more information about installing Spark, refer to the [Spark Installing Document](https://github.com/apache/spark).
## Build SparkSeq
SparkSeq is built using Apache Maven. To build SparkSeq and its example program, run the following command in shell:
```
$ git clone https://github.com/PAA-NCIC/SparkSeq.git
$ cd SparkSeq
$ mvn clean package
...
[INFO] ------------------------------------------------------------------------
[INFO] BUILD SUCCESS
[INFO] ------------------------------------------------------------------------
[INFO] Total time: 01:00 min
[INFO] Finished at: 2017-04-03T19:43:34+08:00
[INFO] Final Memory: 140M/2083M
[INFO] ------------------------------------------------------------------------
```
## Run an example of WGS
A pipeline that calls raw variants (in VCF file format) from raw reads (in FASTQ file format) is implemented as an example program in GPF.
Before running the WGS pipeline, a human b37 reference and its index files must be placed in storage that is visible to every node in the Spark cluster, because the BWA-MEM task needs to load the index into memory when it runs.
The known indel/SNP VCF files and the input FASTQ files must be provided on HDFS or an NFS file system.
We provide an example run script named "runSparkWGS.sh" in the directory "SparkSeq/bin", which looks like the following:
```
spark_master=spark://master:7077
driver_memory=30G
executor_memory=30G
total_executor_cores=1024
spark-submit --class org.ncic.bioinfo.sparkseq.WGSPipeline \
--master ${spark_master} \
--driver-memory ${driver_memory} \
--executor-memory ${executor_memory} \
--total-executor-cores ${total_executor_cores} \
/PATH/TO/SparkSeq/target/spark-seq-0.9.0-jar-with-dependencies.jar \
-ref /PATH/TO/human_g1k_v37.fasta \
-dict /PATH/TO/human_g1k_v37.dict \
-fq1 /PATH/TO/DATA/1.fastq \
-fq2 /PATH/TO/DATA/2.fastq \
-output /PATH/TO/OUTPUT/result.vcf \
-1000gindel /PATH/TO/1000G_phase1.indels.b37.vcf \
-millsindel /PATH/TO/Mills_and_1000G_gold_standard.indels.b37.vcf \
-dbsnp /PATH/TO/dbsnp_138.b37.vcf
```
You can run the script with the following shell command:
```
sh runSparkWGS.sh
```
The WGS pipeline supports all arguments defined by [Apache Spark](http://spark.incubator.apache.org/), and also defines a series of arguments specific to the WGS pipeline.
````
Arguments for process definition.
-fq1 : Path to input fastq file 1.
-fq2 : Path to input fastq file 2.
-ref : Path to b37 reference.
-dict : Path to dict file of b37 reference.
-output : Path to write the output VCF file.
-1000gindel : Path to VCF file: 1000G known indels
-millsindel : Path to VCF file: mills and 1000G indels
-dbsnp : Path to VCF file: dbsnp
````
# License
SparkSeq is released under a [GNU General Public License](https://github.com/PAA-NCIC/SparkSeq/blob/master/LICENSE).
================================================
FILE: bin/config.properties
================================================
# SparkSeq runtime settings, written as "key = value" pairs.
# NOTE(review): the code that loads these keys is not visible from this file;
# confirm each key's exact meaning, units and accepted values against its consumer.
processOptimize = True
shuffleCompress = True
# NOTE(review): "partitonLength" looks like a misspelling of "partitionLength".
# Keep the key as written unless the code that looks it up is renamed in the same change.
partitonLength = 2000000
splitPartitionThres = 4
bqsrGatherThreads = 24
activeRegionRepartitionCount = 256
================================================
FILE: bin/runSparkWGS.sh
================================================
#!/bin/sh
# Example launcher: submits org.ncic.bioinfo.sparkseq.WGSPipeline to a Spark cluster.
#
# Edit the four variables below and replace every /PATH/TO/... placeholder
# before running the script (for example with: sh runSparkWGS.sh).

# Spark resource settings forwarded to spark-submit.
spark_master=spark://master:7077
driver_memory=30G
executor_memory=30G
total_executor_cores=1024

# Expansions are quoted so that a value containing whitespace or glob
# characters is still passed to spark-submit as a single argument.
spark-submit --class org.ncic.bioinfo.sparkseq.WGSPipeline \
    --master "${spark_master}" \
    --driver-memory "${driver_memory}" \
    --executor-memory "${executor_memory}" \
    --total-executor-cores "${total_executor_cores}" \
    /PATH/TO/SparkSeq/target/spark-seq-0.9.0-jar-with-dependencies.jar \
    -ref /PATH/TO/human_g1k_v37.fasta \
    -dict /PATH/TO/human_g1k_v37.dict \
    -fq1 /PATH/TO/DATA/1.fastq \
    -fq2 /PATH/TO/DATA/2.fastq \
    -output /PATH/TO/OUTPUT/result.vcf \
    -1000gindel /PATH/TO/1000G_phase1.indels.b37.vcf \
    -millsindel /PATH/TO/Mills_and_1000G_gold_standard.indels.b37.vcf \
    -dbsnp /PATH/TO/dbsnp_138.b37.vcf
================================================
FILE: pom.xml
================================================
4.0.0org.ncic.bioinfospark-seq0.9.01.82.1.02.7.32.10.4net.alchim31.mavenscala-maven-plugin3.2.0compile-scalavalidateadd-sourcecompiletest-compile-scalatest-compileadd-sourcetestCompile${scala.version}org.apache.maven.pluginsmaven-compiler-plugin3.1${java.version}${java.version}maven-assembly-pluginjar-with-dependenciesmake-assemblypackagesinglejunitjunit4.12org.scalatestscalatest_2.103.0.0log4jlog4j1.2.17com.google.guavaguava14.0.1args4jargs4j2.0.23org.apache.commonscommons-io1.3.2org.apache.commonscommons-lang33.0net.java.truecommonstruecommons-io1.0com.github.samtoolshtsjdk1.128org.apache.hadoophadoop-client${hadoop.version}providedasmasmorg.jboss.nettynettyorg.codehaus.jackson*org.sonatype.sisu.inject*javax.servletservlet-apicom.google.guavaguavaorg.apache.sparkspark-core_2.10${spark.version}providedorg.apache.hadoophadoop-clientorg.apache.hadoophadoop-mapreduceorg.apache.commonscommons-math2.2it.unimi.dsifastutil6.3coltcolt1.2.0org.jgraphtjgrapht-jdk1.50.7.3com.esotericsoftwarekryo-shaded3.0.3
================================================
FILE: src/main/java/com/github/lindenb/jbwa/jni/AlnRgn.java
================================================
/*
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright 2016 Pierre Lindenbaum
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package com.github.lindenb.jbwa.jni;
/**
 * Plain value holder for a single alignment region.
 *
 * <p>All values are supplied through the constructor and exposed through
 * read-only accessors; the class defines no setters.
 */
public class AlnRgn {
    // Field names are kept exactly as declared originally.
    private String chrom;
    private long pos;
    private byte strand;
    private String cigar;
    private int mqual;
    private int NM;
    private int AS;
    private int secondary;

    /**
     * Stores every argument in the field of the same name.
     *
     * @param chrom     value returned by {@link #getChrom()}
     * @param pos       value returned by {@link #getPos()}
     * @param strand    strand byte; exposed as a {@code char} by {@link #getStrand()}
     * @param cigar     value returned by {@link #getCigar()}
     * @param mqual     value returned by {@link #getMQual()}
     * @param NM        value returned by {@link #getNm()}
     * @param AS        value returned by {@link #getAs()}
     * @param secondary value returned by {@link #getSecondary()}
     */
    public AlnRgn(String chrom, long pos, byte strand, String cigar, int mqual, int NM, int AS, int secondary) {
        this.chrom = chrom;
        this.cigar = cigar;
        this.pos = pos;
        this.strand = strand;
        this.mqual = mqual;
        this.secondary = secondary;
        this.NM = NM;
        this.AS = AS;
    }

    /** @return the {@code chrom} value given at construction */
    public String getChrom() {
        return chrom;
    }

    /** @return the {@code pos} value given at construction */
    public long getPos() {
        return pos;
    }

    /** @return the stored strand byte converted to a {@code char} */
    public char getStrand() {
        final char strandAsChar = (char) strand;
        return strandAsChar;
    }

    /** @return the {@code cigar} value given at construction */
    public String getCigar() {
        return cigar;
    }

    /** @return the {@code mqual} value given at construction */
    public int getMQual() {
        return mqual;
    }

    /** @return the {@code NM} value given at construction */
    public int getNm() {
        return NM;
    }

    /** @return the {@code AS} value given at construction */
    public int getAs() {
        return AS;
    }

    /** @return the {@code secondary} value given at construction */
    public int getSecondary() {
        return secondary;
    }

    /**
     * Renders the region as {@code chrom:pos(strand);cigar;mqual;NM;AS;secondary}.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder();
        text.append(chrom).append(":").append(pos);
        text.append("(").append((char) strand).append(");");
        text.append(cigar).append(";");
        text.append(mqual).append(";");
        text.append(NM).append(";");
        text.append(AS).append(";");
        text.append(getSecondary());
        return text.toString();
    }
}
================================================
FILE: src/main/java/com/github/lindenb/jbwa/jni/BwaFrame.java
================================================
/*
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright 2016 Pierre Lindenbaum
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package com.github.lindenb.jbwa.jni;
import java.awt.BorderLayout;
import java.awt.Dimension;
import java.awt.FlowLayout;
import java.awt.Font;
import java.awt.Toolkit;
import java.awt.event.ActionEvent;
import java.awt.event.WindowAdapter;
import java.awt.event.WindowEvent;
import java.io.File;
import java.util.Vector;
import javax.swing.AbstractAction;
import javax.swing.Action;
import javax.swing.JButton;
import javax.swing.JDialog;
import javax.swing.JFileChooser;
import javax.swing.JFrame;
import javax.swing.JMenu;
import javax.swing.JMenuBar;
import javax.swing.JMenuItem;
import javax.swing.JOptionPane;
import javax.swing.JPanel;
import javax.swing.JScrollPane;
import javax.swing.JTable;
import javax.swing.JTextField;
import javax.swing.SwingUtilities;
import javax.swing.border.EmptyBorder;
import javax.swing.filechooser.FileFilter;
import javax.swing.table.AbstractTableModel;
public class BwaFrame extends JFrame {
private final static String COLS[] = new String[]{"CHROM", "POS", "STRAND", "CIGAR", "MQUAL", "NM", "Secondary"};
private class AlnTableModel extends AbstractTableModel {
private Vector array = new Vector();
@Override
public String getColumnName(int column) {
return COLS[column];
}
@Override
public int getColumnCount() {
return COLS.length;
}
@Override
public int getRowCount() {
return array.size();
}
@Override
public Object getValueAt(int rowIndex, int columnIndex) {
AlnRgn a = this.array.get(rowIndex);
switch (columnIndex) {
case 0:
return a.getChrom();
case 1:
return a.getPos();
case 2:
return a.getStrand();
case 3:
return a.getCigar();
case 4:
return a.getMQual();
case 5:
return a.getNm();
case 6:
return a.getSecondary();
default:
return null;
}
}
@Override
public Class> getColumnClass(int columnIndex) {
switch (columnIndex) {
case 0:
return String.class;
case 1:
return Long.class;
case 2:
return Character.class;
case 3:
return String.class;
case 4:
return Integer.class;
case 5:
return Integer.class;
case 6:
return Integer.class;
default:
return Object.class;
}
}
@Override
public boolean isCellEditable(int arg0, int arg1) {
return false;
}
void clear() {
array.clear();
fireTableDataChanged();
}
void addAll(AlnRgn rgn[]) {
array.clear();
if (rgn != null) for (AlnRgn a : rgn) array.add(a);
fireTableDataChanged();
}
}
private AlnTableModel tableModel;
private JTextField seqField;
private BwaIndex bwaIndex;
private BwaFrame(File f, BwaIndex bwaIndex) {
super("JBWA:" + f);
this.bwaIndex = bwaIndex;
setDefaultCloseOperation(JFrame.DO_NOTHING_ON_CLOSE);
addWindowListener(new WindowAdapter() {
@Override
public void windowClosing(WindowEvent e) {
doMenuClose();
}
});
JMenuBar bar = new JMenuBar();
setJMenuBar(bar);
JPanel mainPane = new JPanel(new BorderLayout(5, 5));
mainPane.setBorder(new EmptyBorder(5, 5, 5, 5));
setContentPane(mainPane);
JPanel pane = new JPanel(new FlowLayout(FlowLayout.LEADING));
mainPane.add(pane, BorderLayout.NORTH);
this.seqField = new JTextField(50);
pane.add(seqField);
Action action = new AbstractAction("Align") {
@Override
public void actionPerformed(ActionEvent arg0) {
doMenuAlign();
}
};
seqField.addActionListener(action);
seqField.setText("CCAANCGCGAGAAGATGACCCAGATCATGTTTGAGACCTTCAACACCCCAGCCATGTACGTGGAGATCGGAAGAGCACACGTCTGAACTCCAGTCACCAA");
pane.add(new JButton(action));
this.tableModel = new AlnTableModel();
JTable table = new JTable(tableModel);
table.setFont(new Font("Courier", 0, 20));
table.setRowHeight(25);
//table.setAutoResizeMode(JTable.AUTO_RESIZE_OFF);
mainPane.add(new JScrollPane(table), BorderLayout.CENTER);
JMenu menu = new JMenu("File");
menu.add(action);
menu.add(new AbstractAction("Quit") {
@Override
public void actionPerformed(ActionEvent arg0) {
doMenuClose();
}
});
}
private void doMenuClose() {
this.bwaIndex.close();
this.setVisible(false);
this.dispose();
}
private void doMenuAlign() {
this.tableModel.clear();
String dna = this.seqField.getText().trim().toUpperCase();
if (dna.length() < 10 || !dna.matches("[ATGNC]+")) {
JOptionPane.showMessageDialog(this, "Bad DNA", "Error", JOptionPane.ERROR_MESSAGE);
return;
}
ShortRead read = new ShortRead("Any", dna.getBytes(), dna.replaceAll("[ANTGC]", "I").getBytes());
BwaMem mem = null;
try {
mem = new BwaMem(this.bwaIndex);
this.tableModel.addAll(mem.align(read));
mem.dispose();
} catch (Exception err) {
err.printStackTrace();
JOptionPane.showMessageDialog(this, "BWA-ERROR", "Error", JOptionPane.ERROR_MESSAGE);
} finally {
if (mem != null) mem.dispose();
}
}
/**
 * Launches the BWA demo window.
 *
 * Loads the "bwajni" native library, asks the user to pick an indexed FASTA file
 * (the chooser starts in {@code args[0]}, or in its parent directory when
 * {@code args[0]} is a regular file), opens the index and shows a {@code BwaFrame}
 * inset 50 pixels from every edge of the screen.
 *
 * @param args optional; the first element is used as the chooser's starting location
 */
public static void main(String[] args) {
    JFrame.setDefaultLookAndFeelDecorated(true);
    JDialog.setDefaultLookAndFeelDecorated(true);
    // The native library is loaded before any JNI-backed object is created.
    System.loadLibrary("bwajni");

    File initialDir = null;
    if (args.length > 0) {
        File candidate = new File(args[0]);
        initialDir = candidate.isFile() ? candidate.getParentFile() : candidate;
    }

    JFileChooser chooser = new JFileChooser(initialDir);
    chooser.setFileFilter(new FileFilter() {
        @Override
        public boolean accept(File f) {
            // Anything that is not a regular file (e.g. a directory) stays visible.
            if (f.isFile()) {
                String lowered = f.getName().toLowerCase();
                String[] suffixes = {".fa.gz", ".fa", ".fasta.gz", ".fasta"};
                for (String suffix : suffixes) {
                    if (lowered.endsWith(suffix)) {
                        return true;
                    }
                }
                return false;
            }
            return true;
        }

        @Override
        public String getDescription() {
            return "BWA indexed file";
        }
    });

    int answer = chooser.showOpenDialog(null);
    if (answer != JFileChooser.APPROVE_OPTION) {
        return;
    }
    File chosen = chooser.getSelectedFile();
    if (chosen == null) {
        return;
    }

    System.out.println("Loading " + chosen + "...");
    final BwaIndex loaded;
    try {
        loaded = new BwaIndex(chosen);
    } catch (Exception e) {
        System.err.println("Cannot read " + chosen);
        e.printStackTrace();
        return;
    }

    final BwaFrame window = new BwaFrame(chosen, loaded);
    Dimension screenSize = Toolkit.getDefaultToolkit().getScreenSize();
    final int margin = 50;
    window.setBounds(margin, margin, screenSize.width - 2 * margin, screenSize.height - 2 * margin);

    // Make the window visible on the event dispatch thread and wait until that is done.
    Runnable reveal = new Runnable() {
        @Override
        public void run() {
            window.setVisible(true);
        }
    };
    try {
        SwingUtilities.invokeAndWait(reveal);
    } catch (Exception e) {
        e.printStackTrace();
    }
}
}
================================================
FILE: src/main/java/com/github/lindenb/jbwa/jni/BwaIndex.java
================================================
/*
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright 2016 Pierre Lindenbaum
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package com.github.lindenb.jbwa.jni;
import java.io.File;
import java.io.IOException;
/**
 * Java-side handle for a BWA index opened through JNI.
 *
 * The class implements {@link java.io.Closeable}, so an instance can be managed by
 * a try-with-resources statement; {@link #close()} is also invoked from
 * {@link #finalize()} as a fallback.
 */
public class BwaIndex implements java.io.Closeable {
    /** Value returned by the native {@code _open} call; stays {@code 0L} until the constructor has opened an index. */
    protected long ref = 0L;

    /**
     * Opens the index identified by {@code index}.
     *
     * @param index the index file; its {@code toString()} form is handed to the native layer
     * @throws NullPointerException if {@code index} is {@code null}
     * @throws IOException if the native open call reports a failure
     */
    public BwaIndex(File index) throws IOException {
        // Fail with a readable message before any native code is entered.
        if (index == null) {
            throw new NullPointerException("index is null");
        }
        ref = _open(index.toString());
    }

    /** Last-resort cleanup: delegates to {@link #close()}. */
    @Override
    protected void finalize() {
        close();
    }

    /**
     * Native cleanup hook.
     * NOTE(review): presumably releases the native index behind {@code ref} and tolerates
     * repeated calls (it also runs from {@code finalize()}) - confirm against the JNI source.
     */
    @Override
    public native void close();

    /** Native open call; the returned value is stored in {@code ref}. */
    private static native long _open(String s) throws IOException;
}
================================================
FILE: src/main/java/com/github/lindenb/jbwa/jni/BwaMem.java
================================================
/*
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright 2016 Pierre Lindenbaum
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package com.github.lindenb.jbwa.jni;
import java.io.*;
import java.util.List;
/**
 * Java-side handle for the BWA-MEM aligner, bound to one {@link BwaIndex}.
 *
 * Every {@code align} overload returns {@code null} once {@code ref} is {@code 0L}.
 */
public class BwaMem {
    /** Value returned by the native {@code mem_opt_init} call; the {@code align} methods treat {@code 0L} as "not usable". */
    protected long ref = 0L;
    /** Index handed to the native alignment calls. */
    private BwaIndex bwaIndex = null;

    /**
     * Creates an aligner working against {@code bwaIndex}.
     *
     * @param bwaIndex the index passed to every native alignment call
     */
    public BwaMem(BwaIndex bwaIndex) {
        this.ref = BwaMem.mem_opt_init();
        this.bwaIndex = bwaIndex;
    }

    /**
     * Forwards the given penalties, unchanged and in the same order, to the native
     * {@code update_score_parameters} call.
     *
     * @param baseMismatchPen mismatch penalty
     * @param gapOpenPenIns   gap-open penalty for insertions
     * @param gapOpenPenDel   gap-open penalty for deletions
     * @param gapExtPenIns    gap-extension penalty for insertions
     * @param gapExtPenDel    gap-extension penalty for deletions
     * @param clipPen5        clipping penalty, 5' end
     * @param clipPen3        clipping penalty, 3' end
     */
    public void updateScoringParameters(final int baseMismatchPen,
                                        final int gapOpenPenIns, final int gapOpenPenDel,
                                        final int gapExtPenIns, final int gapExtPenDel,
                                        final int clipPen5, final int clipPen3) {
        update_score_parameters(baseMismatchPen, gapOpenPenIns, gapOpenPenDel, gapExtPenIns, gapExtPenDel, clipPen5, clipPen3);
    }

    /**
     * Aligns a single read; only its bases are passed to the native layer.
     *
     * @param read the read to align
     * @return the regions reported by the native call, or {@code null} when {@code ref} is {@code 0L}
     * @throws IOException if the native call reports a failure
     */
    public AlnRgn[] align(ShortRead read) throws IOException {
        if (ref == 0L) return null;
        return align(this.bwaIndex, read.getBases());
    }

    /**
     * List-based convenience form of {@link #align(ShortRead[], ShortRead[])}.
     *
     * The parameters are typed {@code List<ShortRead>}: on a raw {@code List},
     * {@code toArray(new ShortRead[n])} is typed {@code Object[]} and the delegating
     * call below would not resolve to any overload.
     *
     * @param ks1 first reads of each pair
     * @param ks2 second reads of each pair
     * @return the strings produced by the array overload, or {@code null} when {@code ref} is {@code 0L}
     * @throws IllegalArgumentException if either list is {@code null}
     * @throws IOException if the native call reports a failure
     */
    public String[] align(final List<ShortRead> ks1, final List<ShortRead> ks2) throws IOException {
        if (ref == 0L) return null;
        if (ks1 == null) throw new IllegalArgumentException("ks1 is null");
        if (ks2 == null) throw new IllegalArgumentException("ks2 is null");
        return align(
                ks1.toArray(new ShortRead[ks1.size()]),
                ks2.toArray(new ShortRead[ks2.size()])
        );
    }

    /**
     * Aligns two equally long arrays of reads through the native {@code align2} call.
     *
     * @param ks1 first reads of each pair
     * @param ks2 second reads of each pair; must have the same length as {@code ks1}
     * @return the strings returned by the native call, or {@code null} when {@code ref}
     *         is {@code 0L} or the arrays are empty
     * @throws IllegalArgumentException if either array is {@code null} or the lengths differ
     * @throws IOException if the native call reports a failure
     */
    public String[] align(final ShortRead ks1[], final ShortRead ks2[]) throws IOException {
        if (ref == 0L) return null;
        if (ks1 == null) throw new IllegalArgumentException("ks1 is null");
        if (ks2 == null) throw new IllegalArgumentException("ks2 is null");
        if (ks1.length != ks2.length) throw new IllegalArgumentException("ks1.length!=ks2.length");
        if (ks1.length == 0) return null;
        return align2(this.bwaIndex, ks1, ks2);
    }

    /** Last-resort cleanup: delegates to {@link #dispose()}. */
    @Override
    protected void finalize() {
        dispose();
    }

    /**
     * Native cleanup hook.
     * NOTE(review): presumably frees the native state behind {@code ref} and resets it to
     * {@code 0L}, which the guards in the {@code align} methods rely on - confirm against the JNI source.
     */
    public native void dispose();

    /** Native initialiser; its return value is stored in {@code ref} by the constructor. */
    private static native long mem_opt_init();

    /**
     * Verbosity (from http://bio-bwa.sourceforge.net/bwa.shtml#3)
     * A value 0 for disabling all the output to stderr;
     * 1 for outputting errors only;
     * 2 for warnings and errors;
     * 3 for all normal messages;
     * 4 or higher for debugging. When this option takes value 4, the output is not SAM.
     *
     * If this method is not called, the default level is 3.
     */
    public native void set_verbosity(int verbosity);

    /** Native counterpart of {@link #updateScoringParameters}; receives the same seven values in the same order. */
    private native void update_score_parameters(int B, int Oi, int Od, int Ei, int Ed, int L5, int L3);

    /** Native single-read alignment over raw bases. */
    private native AlnRgn[] align(BwaIndex bwaIndex, byte bases[]) throws IOException;

    /** Native alignment over two parallel arrays of reads. */
    private native String[] align2(BwaIndex bwaIndex, final ShortRead ks1[], final ShortRead ks2[]) throws IOException;
}
================================================
FILE: src/main/java/com/github/lindenb/jbwa/jni/KSeq.java
================================================
/*
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright 2016 Pierre Lindenbaum
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package com.github.lindenb.jbwa.jni;
import java.io.File;
import java.io.IOException;
/**
 * Java side of a JNI binding that hands out {@link ShortRead} objects one at a time.
 *
 * <p>Every operation except construction bookkeeping is a native method; this class only
 * keeps the {@code long} returned by the native {@code init} call in {@link #ref}.
 * NOTE(review): {@code ref} is presumably a native handle/pointer owned by the C side -
 * confirm against the JNI implementation before touching it from Java.
 */
public class KSeq {
// Value returned by the native init(String) call; stays 0 until a constructor has assigned it.
protected long ref = 0L;
/**
 * Creates a reader for the given file.
 *
 * @param f file to read; when {@code null}, the string "-" is passed to the native
 *          initializer instead of a path (NOTE(review): "-" presumably selects standard
 *          input - confirm in the native code)
 * @throws IOException declared by the signature; the raising site is in native code and
 *                     not visible here
 */
public KSeq(File f) throws IOException {
this.ref = KSeq.init(f == null ? "-" : f.toString());
}
/**
 * Creates a reader without a file; delegates to {@link #KSeq(File)} with {@code null}.
 *
 * @throws IOException see {@link #KSeq(File)}
 */
public KSeq() throws IOException {
this(null);
}
/**
 * Returns the next read produced by the native reader.
 * NOTE(review): end-of-input behaviour (possibly a {@code null} return) is defined by the
 * native implementation - confirm before relying on it.
 *
 * @throws IOException declared by the signature; raised by native code
 */
public native ShortRead next() throws IOException;
/**
 * Finalizer hook: unconditionally calls {@link #dispose()}.
 * NOTE(review): if a caller already invoked dispose() explicitly this results in a second
 * call - verify that the native dispose tolerates that.
 */
@Override
protected void finalize() {
dispose();
}
/**
 * Native cleanup entry point.
 * NOTE(review): presumably releases the native resources behind {@code ref} - confirm.
 */
public native void dispose();
/**
 * Native initializer; the constructors store its return value in {@link #ref}.
 *
 * @param file path string, or "-" when no file was supplied
 */
private static native long init(String file);
}
================================================
FILE: src/main/java/com/github/lindenb/jbwa/jni/ShortRead.java
================================================
/*
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright 2016 Pierre Lindenbaum
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package com.github.lindenb.jbwa.jni;
/**
 * A single read: a name together with its base bytes and quality bytes.
 *
 * <p>The arrays are stored and returned by reference; no defensive copies are made.
 */
public class ShortRead {
    private String name;
    private byte[] seq;
    private byte[] qual;

    /**
     * Stores the three components exactly as given.
     *
     * @param name read name
     * @param seq  base bytes
     * @param qual quality bytes
     */
    public ShortRead(String name, byte[] seq, byte[] qual) {
        this.name = name;
        this.seq = seq;
        this.qual = qual;
    }

    /** @return the read name passed to the constructor */
    public String getName() {
        return name;
    }

    /** @return the base byte array passed to the constructor (same instance) */
    public byte[] getBases() {
        return seq;
    }

    /** @return the quality byte array passed to the constructor (same instance) */
    public byte[] getQualities() {
        return qual;
    }

    /**
     * Renders the read as four lines: {@code @name}, the bases, a lone {@code +}, and the
     * qualities. The byte arrays are decoded with the platform default charset.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder();
        text.append("@").append(name).append("\n");
        text.append(new String(seq)).append("\n+\n");
        text.append(new String(qual));
        return text.toString();
    }
}
================================================
FILE: src/main/java/org/ncic/bioinfo/sparkseq/algorithms/adapters/ApplyBQSRAdaptor.java
================================================
/*
* Copyright (c) 2017 NCIC, Institute of Computing Technology, Chinese Academy of Sciences
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package org.ncic.bioinfo.sparkseq.algorithms.adapters;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SAMSequenceDictionary;
import org.ncic.bioinfo.sparkseq.algorithms.data.reference.RefContentProvider;
import org.ncic.bioinfo.sparkseq.algorithms.data.sam.GATKSAMRecord;
import org.ncic.bioinfo.sparkseq.algorithms.data.sam.SamContentProvider;
import org.ncic.bioinfo.sparkseq.algorithms.data.vcf.RODContentProvider;
import org.ncic.bioinfo.sparkseq.algorithms.data.vcf.RODNames;
import org.ncic.bioinfo.sparkseq.algorithms.utils.GenomeLocParser;
import org.ncic.bioinfo.sparkseq.algorithms.utils.reports.GATKReport;
import org.ncic.bioinfo.sparkseq.algorithms.walker.printreads.PrintReads;
import org.ncic.bioinfo.sparkseq.data.basic.BasicSamRecord;
import org.ncic.bioinfo.sparkseq.data.common.RefContigInfo;
import org.ncic.bioinfo.sparkseq.data.partition.FastaPartition;
import org.ncic.bioinfo.sparkseq.data.partition.SamRecordPartition;
import org.ncic.bioinfo.sparkseq.data.partition.VcfRecordPartition;
import org.ncic.bioinfo.sparkseq.transfer.GATKReportTransfer;
import org.ncic.bioinfo.sparkseq.transfer.SAMRecord2BasicTransfer;
import org.ncic.bioinfo.sparkseq.transfer.SAMSequenceDictTransfer;
import scala.collection.JavaConversions;
import java.util.ArrayList;
import java.util.List;
/**
 * Adapter that runs the {@link PrintReads} walker over one partition of reads with a BQSR
 * table and returns the walker's result records after conversion.
 *
 * Author: wbc
 */
public class ApplyBQSRAdaptor {

    /**
     * Applies a BQSR table to the reads of one partition.
     *
     * @param refContigInfo      contig information used to build the sequence dictionary
     * @param samRecordPartition reads to process
     * @param refPartition       reference sequence for the partition
     * @param rodPartitions      ROD partitions; each is registered under the name
     *                           {@code RODNames.KNOWN_ALLELES + rodPartition.key()}
     * @param bqsrTableLines     the BQSR report in line form
     * @return the walker's result records, each passed through {@link SAMRecord2BasicTransfer}
     */
    public static List applyBQSR(RefContigInfo refContigInfo,
                                 SamRecordPartition samRecordPartition,
                                 FastaPartition refPartition,
                                 List rodPartitions,
                                 List bqsrTableLines) {
        SAMSequenceDictionary dictionary = SAMSequenceDictTransfer.transfer(refContigInfo);
        GenomeLocParser locParser = new GenomeLocParser(dictionary);
        SamContentProvider readProvider = new SamContentProvider(samRecordPartition);
        RefContentProvider referenceProvider = new RefContentProvider(dictionary, refPartition);

        List rodProviders = new java.util.ArrayList<>();
        rodPartitions.forEach(
                rodPartition -> rodProviders.add(
                        new RODContentProvider(RODNames.KNOWN_ALLELES + rodPartition.key(), rodPartition, locParser))
        );

        GATKReport recalTable = GATKReportTransfer.lines2Report(bqsrTableLines);

        // Use the BQSR table to transform the reads.
        PrintReads walker = new PrintReads(locParser, referenceProvider,
                readProvider, rodProviders, recalTable);
        walker.run();

        // Convert the resulting SAM records into basic SAM records.
        List walkerOutput = walker.getResultRecords();
        List converted = new ArrayList<>();
        SAMRecord2BasicTransfer converter = new SAMRecord2BasicTransfer();
        walkerOutput.forEach(record -> converted.add(converter.transfer(record)));
        return converted;
    }
}
================================================
FILE: src/main/java/org/ncic/bioinfo/sparkseq/algorithms/adapters/BQSRTableGather.java
================================================
/*
* Copyright (c) 2017 NCIC, Institute of Computing Technology, Chinese Academy of Sciences
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package org.ncic.bioinfo.sparkseq.algorithms.adapters;
import org.ncic.bioinfo.sparkseq.algorithms.utils.reports.GATKReport;
import org.ncic.bioinfo.sparkseq.algorithms.walker.printreads.RecalibrationReport;
import org.ncic.bioinfo.sparkseq.exceptions.GATKException;
import org.ncic.bioinfo.sparkseq.exceptions.PipelineException;
import org.ncic.bioinfo.sparkseq.transfer.GATKReportTransfer;
import scala.collection.JavaConversions;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.Future;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.RecursiveTask;
import java.util.concurrent.TimeUnit;
/**
 * Merges several BQSR recalibration tables, each supplied in report-line form, into a single
 * table that is returned in report-line form again.
 *
 * Author: wbc
 */
public class BQSRTableGather {

    /**
     * Merges the given tables one after another on the calling thread.
     *
     * <p>The first table is always the merge base. Each later table is combined into it only
     * when its "RecalTable0" table has at least one row. Quantized qualities are recalculated
     * once after all tables have been combined.
     *
     * @param reportLinesGroup one entry of report lines per table; must not be empty
     * @return the lines of the merged report
     * @throws PipelineException if {@code reportLinesGroup} is empty
     */
    public static List gatherBQSRTables(List> reportLinesGroup) {
        if (reportLinesGroup.size() == 0) {
            throw new PipelineException("No bqsr tables to merge");
        }
        final RecalibrationReport finalReport = new RecalibrationReport(
                GATKReportTransfer.lines2Report(reportLinesGroup.get(0)));
        int count = reportLinesGroup.size();
        for (int i = 1; i < count; i++) {
            List reportLines = reportLinesGroup.get(i);
            GATKReport gatkReport = GATKReportTransfer.lines2Report(reportLines);
            if (gatkReport.getTable("RecalTable0").getNumRows() != 0) {
                finalReport.combine(new RecalibrationReport(gatkReport));
            }
        }
        finalReport.calculateQuantizedQualities();
        GATKReport gatkReport = finalReport.createGATKReport();
        return GATKReportTransfer.report2Lines(gatkReport);
    }

    /**
     * Parses the tables on a fixed-size thread pool, then merges them sequentially in index
     * order on the calling thread.
     *
     * <p>Only the first {@code min(rawReportLinesGroup.size(), tableCount)} tables are used.
     * The first parsed table is the merge base; each later one is combined only when the first
     * dimension of its table 0 is non-zero.
     *
     * @param rawReportLinesGroup one entry of report lines per table
     * @param threadCount         number of threads used for parsing
     * @param tableCount          maximum number of tables taken from the head of the list
     * @return the lines of the merged report
     * @throws PipelineException if there is no table to merge, if the calling thread is
     *                           interrupted while waiting for the parsers, or if a table could
     *                           not be parsed
     */
    public static List gatherBQSRTablesInParallel(final List> rawReportLinesGroup, int threadCount, int tableCount) {
        int tableSize = (rawReportLinesGroup.size() < tableCount) ? rawReportLinesGroup.size() : tableCount;
        // Same contract as gatherBQSRTables: fail with a clear message instead of an index error.
        if (tableSize <= 0) {
            throw new PipelineException("No bqsr tables to merge");
        }
        List> reportLinesGroup = rawReportLinesGroup.subList(0, tableSize);
        RecalibrationReport[] reports = new RecalibrationReport[reportLinesGroup.size()];

        ExecutorService pool = Executors.newFixedThreadPool(threadCount);
        try {
            for (int i = 0; i < reportLinesGroup.size(); i++) {
                final int reportId = i;
                pool.execute(new Runnable() {
                    @Override
                    public void run() {
                        List lines = reportLinesGroup.get(reportId);
                        GATKReport gatkReport = GATKReportTransfer.lines2Report(lines);
                        reports[reportId] = new RecalibrationReport(gatkReport);
                    }
                });
            }
        } finally {
            // Always stop accepting work so the pool threads can exit, even if submission failed.
            pool.shutdown();
        }

        try {
            while (!pool.awaitTermination(1, TimeUnit.SECONDS)) {
                // keep waiting until every parser task has finished
            }
        } catch (InterruptedException e) {
            // Do not continue with half-filled results: cancel the workers, keep the interrupt
            // status for the caller, and report the failure.
            pool.shutdownNow();
            Thread.currentThread().interrupt();
            throw new PipelineException("Interrupted while parsing bqsr tables");
        }

        // A task that threw leaves its slot empty; surface that instead of a later NPE.
        for (int i = 0; i < reports.length; i++) {
            if (reports[i] == null) {
                throw new PipelineException("Failed to parse bqsr table " + i);
            }
        }

        RecalibrationReport finalReport = reports[0];
        for (int i = 1; i < reports.length; i++) {
            if (reports[i].getRecalibrationTables().getTable(0).getDimensions()[0] != 0) {
                finalReport.combine(reports[i]);
            }
        }
        finalReport.calculateQuantizedQualities();
        GATKReport gatkReport = finalReport.createGATKReport();
        return GATKReportTransfer.report2Lines(gatkReport);
        // Disabled fork/join based merge, kept for reference (see ForkJoinMergeTask):
        /*ForkJoinPool forkJoinPool = new ForkJoinPool();
        Future futureResult = forkJoinPool.submit(new ForkJoinMergeTask(reports, 0, reports.length - 1));
        try {
        RecalibrationReport finalReport = futureResult.get();
        finalReport.calculateQuantizedQualities();
        GATKReport gatkReport = finalReport.createGATKReport();
        return GATKReportTransfer.report2Lines(gatkReport);
        } catch (Exception e) {
        throw new GATKException("Error when combine BQSR report:" + e.getMessage());
        }*/
    }

    /**
     * Recursive pairwise merge of {@code reports[start..stop]} (both inclusive).
     *
     * Temporarily disabled because the results it produces are inconsistent.
     */
    private static class ForkJoinMergeTask extends RecursiveTask {
        private int start;
        private int stop;
        RecalibrationReport[] reports;

        public ForkJoinMergeTask(RecalibrationReport[] reports, int start, int end) {
            this.reports = reports;
            this.start = start;
            this.stop = end;
        }

        public RecalibrationReport compute() {
            if (start == stop) {
                return reports[start];
            } else if (start == stop - 1) {
                reports[start].combine(reports[stop]);
                return reports[start];
            } else {
                int middle = (start + stop) / 2;
                ForkJoinMergeTask leftTask = new ForkJoinMergeTask(reports, start, middle);
                ForkJoinMergeTask rightTask = new ForkJoinMergeTask(reports, middle + 1, stop);
                leftTask.fork();
                rightTask.fork();
                RecalibrationReport leftRes = leftTask.join();
                RecalibrationReport rightRes = rightTask.join();
                leftRes.combine(rightRes);
                return leftRes;
            }
        }
    }
}
================================================
FILE: src/main/java/org/ncic/bioinfo/sparkseq/algorithms/adapters/BaseRecalibratorAdapter.java
================================================
/*
* Copyright (c) 2017 NCIC, Institute of Computing Technology, Chinese Academy of Sciences
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package org.ncic.bioinfo.sparkseq.algorithms.adapters;
import htsjdk.samtools.SAMSequenceDictionary;
import org.ncic.bioinfo.sparkseq.algorithms.data.reference.RefContentProvider;
import org.ncic.bioinfo.sparkseq.algorithms.data.sam.SamContentProvider;
import org.ncic.bioinfo.sparkseq.algorithms.data.vcf.RODContentProvider;
import org.ncic.bioinfo.sparkseq.algorithms.data.vcf.RODNames;
import org.ncic.bioinfo.sparkseq.algorithms.utils.GenomeLocParser;
import org.ncic.bioinfo.sparkseq.algorithms.utils.reports.GATKReport;
import org.ncic.bioinfo.sparkseq.algorithms.walker.baserecalibrator.BaseRecalibrator;
import org.ncic.bioinfo.sparkseq.data.common.RefContigInfo;
import org.ncic.bioinfo.sparkseq.data.partition.FastaPartition;
import org.ncic.bioinfo.sparkseq.data.partition.SamRecordPartition;
import org.ncic.bioinfo.sparkseq.data.partition.VcfRecordPartition;
import org.ncic.bioinfo.sparkseq.transfer.GATKReportTransfer;
import org.ncic.bioinfo.sparkseq.transfer.SAMSequenceDictTransfer;
import scala.collection.JavaConversions;
import java.util.List;
/**
 * Adapter that runs the {@link BaseRecalibrator} walker over one partition and returns the
 * resulting recalibration report in line form.
 *
 * Author: wbc
 */
public class BaseRecalibratorAdapter {

    /**
     * Builds the recalibration table for one partition of reads.
     *
     * @param refContigInfo      contig information used to build the sequence dictionary
     * @param samRecordPartition reads to process
     * @param refPartition       reference sequence for the partition
     * @param rodPartitions      ROD partitions; each is registered under the name
     *                           {@code RODNames.KNOWN_ALLELES + rodPartition.key()}
     * @return the walker's report converted by {@code GATKReportTransfer.report2Lines}
     */
    public static List getRecalTableLines(RefContigInfo refContigInfo,
                                          SamRecordPartition samRecordPartition,
                                          FastaPartition refPartition,
                                          List rodPartitions) {
        SAMSequenceDictionary dictionary = SAMSequenceDictTransfer.transfer(refContigInfo);
        GenomeLocParser locParser = new GenomeLocParser(dictionary);
        SamContentProvider readProvider = new SamContentProvider(samRecordPartition);
        RefContentProvider referenceProvider = new RefContentProvider(dictionary, refPartition);

        List rodProviders = new java.util.ArrayList<>();
        rodPartitions.forEach(
                rodPartition -> rodProviders.add(
                        new RODContentProvider(RODNames.KNOWN_ALLELES + rodPartition.key(), rodPartition, locParser))
        );

        // Generate the recal table.
        BaseRecalibrator walker = new BaseRecalibrator(
                locParser, referenceProvider, readProvider, rodProviders);
        walker.run();
        return GATKReportTransfer.report2Lines(walker.getReport());
    }
}
================================================
FILE: src/main/java/org/ncic/bioinfo/sparkseq/algorithms/adapters/BwaMemAdapter.java
================================================
/*
* Copyright (c) 2017 NCIC, Institute of Computing Technology, Chinese Academy of Sciences
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package org.ncic.bioinfo.sparkseq.algorithms.adapters;
import com.github.lindenb.jbwa.jni.BwaIndex;
import com.github.lindenb.jbwa.jni.BwaMem;
import com.github.lindenb.jbwa.jni.ShortRead;
import org.apache.commons.lang3.StringUtils;
import org.ncic.bioinfo.sparkseq.data.basic.BasicSamRecord;
import org.ncic.bioinfo.sparkseq.data.basic.FastqPairRecord;
import org.ncic.bioinfo.sparkseq.data.common.ReadGroupInfo;
import org.ncic.bioinfo.sparkseq.data.common.RefContigInfo;
import org.ncic.bioinfo.sparkseq.exceptions.PipelineException;
import org.ncic.bioinfo.sparkseq.transfer.FastqRecord2ShortReadTransfer;
import scala.collection.JavaConversions;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
 * Paired-end alignment through the JNI bwa binding.
 *
 * This class uses the singleton pattern so that each process loads only one copy of the index.
 *
 * Author: wbc
 */
public class BwaMemAdapter {
    private static volatile BwaIndex bwaIndexInstance = null;

    /**
     * Returns the process-wide index, loading the native library and the index on first use.
     * The null check is repeated inside the class lock so that only one caller performs the load.
     *
     * @param bwaJNILibPath path handed to {@code System.load}
     * @param referencePath path of the reference the index is opened from
     * @throws IOException if opening the index fails
     */
    private static BwaIndex getBwaIndexInstance(String bwaJNILibPath, String referencePath) throws IOException {
        if (bwaIndexInstance != null) {
            return bwaIndexInstance;
        }
        synchronized (BwaMemAdapter.class) {
            if (bwaIndexInstance == null) {
                System.load(bwaJNILibPath);
                bwaIndexInstance = new BwaIndex(new File(referencePath));
            }
        }
        return bwaIndexInstance;
    }

    /**
     * Aligns all read pairs and returns the produced records.
     * Pairs are sent to the aligner in batches of at most 1000.
     *
     * @param bwaJNILibPath    path of the native bwa library
     * @param referencePath    path of the reference
     * @param readGroupInfo    read group whose id is appended to every record as an RG tag
     * @param refContigInfo    contig information used when building each record
     * @param fastqPairRecords read pairs to align
     * @return all records produced by the aligner, in batch order
     * @throws PipelineException if the index cannot be loaded or the native call fails
     */
    public static List pairAlign(String bwaJNILibPath,
                                 String referencePath,
                                 ReadGroupInfo readGroupInfo,
                                 RefContigInfo refContigInfo,
                                 List fastqPairRecords) {
        List results = new ArrayList<>();
        final BwaIndex bwaIndex;
        try {
            bwaIndex = getBwaIndexInstance(bwaJNILibPath, referencePath);
        } catch (IOException e) {
            e.printStackTrace();
            throw new PipelineException("Error when load index in JNI bwa");
        }

        int chunkSize = 1000;
        List reads1 = new ArrayList<>(chunkSize);
        List reads2 = new ArrayList<>(chunkSize);
        for(FastqPairRecord pairRecord : fastqPairRecords) {
            reads1.add(FastqRecord2ShortReadTransfer.transferRead1(pairRecord));
            reads2.add(FastqRecord2ShortReadTransfer.transferRead2(pairRecord));
            // A full batch is aligned right away and the buffers are reused.
            if (reads1.size() == chunkSize) {
                alignAndAddIntoResult(bwaIndex, results, readGroupInfo, refContigInfo, reads1, reads2);
                reads1.clear();
                reads2.clear();
            }
        }
        // Align whatever is left over after the last full batch.
        if (!reads1.isEmpty()) {
            alignAndAddIntoResult(bwaIndex, results, readGroupInfo, refContigInfo, reads1, reads2);
        }
        return results;
    }

    /**
     * Aligns one batch and appends a record for every output line to {@code results}.
     * Each line gets the tag {@code RG:Z:<read group id>} appended, preceded by a tab unless
     * the line already ends with one.
     */
    private static void alignAndAddIntoResult(BwaIndex bwaIndex, List results,
                                              ReadGroupInfo readGroupInfo, RefContigInfo refContigInfo,
                                              List reads1, List reads2){
        try {
            BwaMem aligner = new BwaMem(bwaIndex);
            String[] alignerOutput = aligner.align(reads1, reads2);
            String rgTag = "RG:Z:" + readGroupInfo.id();
            for (String outputEntry : alignerOutput) {
                // This library sometimes returns several records at once, separated by \n.
                for (String samLine : StringUtils.split(outputEntry, '\n')) {
                    String separator = samLine.endsWith("\t") ? "" : "\t";
                    results.add(BasicSamRecord.apply(samLine + separator + rgTag, refContigInfo, false));
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
            throw new PipelineException("Error when call JNI bwa");
        }
    }
}
================================================
FILE: src/main/java/org/ncic/bioinfo/sparkseq/algorithms/adapters/HaplotypeCallerAdapter.java
================================================
/*
* Copyright (c) 2017 NCIC, Institute of Computing Technology, Chinese Academy of Sciences
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package org.ncic.bioinfo.sparkseq.algorithms.adapters;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMReadGroupRecord;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFCodec;
import htsjdk.variant.vcf.VCFHeader;
import org.ncic.bioinfo.sparkseq.algorithms.data.reference.RefContentProvider;
import org.ncic.bioinfo.sparkseq.algorithms.data.sam.SamContentProvider;
import org.ncic.bioinfo.sparkseq.algorithms.data.vcf.RODContentProvider;
import org.ncic.bioinfo.sparkseq.algorithms.data.vcf.RODNames;
import org.ncic.bioinfo.sparkseq.algorithms.data.vcf.VCFHeaderLineIterable;
import org.ncic.bioinfo.sparkseq.algorithms.data.vcf.header.StandardWGSVCFHeader;
import org.ncic.bioinfo.sparkseq.algorithms.utils.GenomeLocParser;
import org.ncic.bioinfo.sparkseq.algorithms.walker.SerializableActiveRegionMapData;
import org.ncic.bioinfo.sparkseq.algorithms.walker.genotypegvcfs.GenotypeGVCFs;
import org.ncic.bioinfo.sparkseq.algorithms.walker.haplotypecaller.ActiveRegionFinder;
import org.ncic.bioinfo.sparkseq.algorithms.walker.haplotypecaller.ActiveRegionMapData;
import org.ncic.bioinfo.sparkseq.algorithms.walker.haplotypecaller.HaplotypeCaller;
import org.ncic.bioinfo.sparkseq.algorithms.walker.haplotypecaller.SampleList;
import org.ncic.bioinfo.sparkseq.data.basic.VcfRecord;
import org.ncic.bioinfo.sparkseq.data.common.ReadGroupInfo;
import org.ncic.bioinfo.sparkseq.data.common.RefContigInfo;
import org.ncic.bioinfo.sparkseq.data.common.SamHeaderInfo;
import org.ncic.bioinfo.sparkseq.data.common.VcfHeaderInfo;
import org.ncic.bioinfo.sparkseq.data.partition.FastaPartition;
import org.ncic.bioinfo.sparkseq.data.partition.SamRecordPartition;
import org.ncic.bioinfo.sparkseq.data.partition.VcfRecordPartition;
import org.ncic.bioinfo.sparkseq.transfer.*;
import scala.collection.JavaConversions;
import java.util.ArrayList;
import java.util.List;
/**
 * Adapter between SparkSeq's partition/record data model and the HaplotypeCaller walker.
 * <p>
 * The work is split in two steps: {@link #getActiveRegions} runs the {@link ActiveRegionFinder}
 * over one partition and wraps every active region into a {@link SerializableActiveRegionMapData};
 * {@link #callVariants} turns those wrappers back into {@link ActiveRegionMapData} and feeds them
 * to {@link HaplotypeCaller#map}.
 * </p>
 *
 * Author: wbc
 */
public class HaplotypeCallerAdapter {

    /**
     * Finds the active regions of one partition and converts them to their serializable form.
     *
     * @param refContigInfo      contig description used to build the sequence dictionary.
     * @param samRecordPartition reads of the partition.
     * @param refPartition       reference bases of the partition.
     * @param rodPartitions      VCF partitions; each one becomes a ROD provider named by its {@code key()}.
     * @param useGVCF            forwarded unchanged to the {@link ActiveRegionFinder}.
     * @return one serializable wrapper per active region, in the order reported by the finder.
     */
    public static List<SerializableActiveRegionMapData> getActiveRegions(RefContigInfo refContigInfo,
                                                                         SamRecordPartition samRecordPartition,
                                                                         FastaPartition refPartition,
                                                                         List<VcfRecordPartition> rodPartitions,
                                                                         boolean useGVCF) {
        SAMSequenceDictionary samSequenceDictionary = SAMSequenceDictTransfer.transfer(refContigInfo);
        GenomeLocParser parser = new GenomeLocParser(samSequenceDictionary);

        SamContentProvider samContentProvider = new SamContentProvider(samRecordPartition);
        RefContentProvider refContentProvider = new RefContentProvider(samSequenceDictionary, refPartition);

        List<RODContentProvider> rodContentProviders = new ArrayList<>(rodPartitions.size());
        for (VcfRecordPartition rodPartition : rodPartitions) {
            rodContentProviders.add(new RODContentProvider(rodPartition.key(), rodPartition, parser));
        }

        // First collect the active-region information.
        ActiveRegionFinder activeRegionFinder = new ActiveRegionFinder(
                parser, refContentProvider, samContentProvider, rodContentProviders, useGVCF);
        activeRegionFinder.run();
        List<ActiveRegionMapData> activeRegionMapDatas = activeRegionFinder.getResultActiveRegions();

        // Converters shared by every wrapper built below.
        SAMRecord2BasicTransfer samTransfer = new SAMRecord2BasicTransfer();
        VCFHeader header = StandardWGSVCFHeader.getHeader();
        VC2VcfRecordTransfer vcfTransfer = new VC2VcfRecordTransfer(header, refContigInfo);

        List<SerializableActiveRegionMapData> serializableActiveRegionMapDataList =
                new ArrayList<>(activeRegionMapDatas.size());
        for (ActiveRegionMapData mapData : activeRegionMapDatas) {
            serializableActiveRegionMapDataList.add(
                    new SerializableActiveRegionMapData(mapData, samTransfer, vcfTransfer));
        }
        return serializableActiveRegionMapDataList;
    }

    /**
     * Plain holder for the objects {@link #callVariants} needs; filled by {@link #getStaticDataInstance}.
     */
    public static class StaticData {
        public HaplotypeCaller haplotypeCaller = null;
        public GenomeLocParser genomeLocParser = null;
        public Basic2SAMRecordTransfer basic2SAMRecordTransfer = null;
        public VCFHeader vcfFileHeader = null;
        public VCFCodec codec = null;
    }

    /**
     * Builds a fresh {@link StaticData} for the given headers.
     * <p>
     * NOTE(review): this method appends a read group {@code rg1}/{@code sample1} to the
     * {@code samHeaderInfo} object passed in, i.e. it modifies the caller's argument on every
     * invocation. Whether repeated additions are harmless depends on
     * {@code SamHeaderInfo.addReadGroupInfo}, which is not visible here - confirm.
     * </p>
     *
     * @param refContigInfo contig description used to build the sequence dictionary.
     * @param useGVCF       forwarded unchanged to the {@link HaplotypeCaller} constructor.
     * @param samHeaderInfo SAM header description; receives an extra read group (see note).
     * @param vcfHeaderInfo VCF header lines parsed into {@link StaticData#vcfFileHeader}.
     * @return a newly populated holder, never {@code null}.
     */
    public static StaticData getStaticDataInstance(RefContigInfo refContigInfo,
                                                   boolean useGVCF,
                                                   SamHeaderInfo samHeaderInfo,
                                                   VcfHeaderInfo vcfHeaderInfo) {
        StaticData data = new StaticData();
        SAMSequenceDictionary samSequenceDictionary = SAMSequenceDictTransfer.transfer(refContigInfo);
        data.genomeLocParser = new GenomeLocParser(samSequenceDictionary);

        samHeaderInfo.addReadGroupInfo(ReadGroupInfo.apply("rg1", "sample1"));
        SAMFileHeader header = SAMHeaderTransfer.transfer(samHeaderInfo);

        // The caller is configured with the sample name of every read group in the header.
        List<SAMReadGroupRecord> readGroupInfos = header.getReadGroups();
        List<String> samples = new ArrayList<>(readGroupInfos.size());
        for (SAMReadGroupRecord readGroup : readGroupInfos) {
            samples.add(readGroup.getSample());
        }
        data.haplotypeCaller = new HaplotypeCaller(data.genomeLocParser, samples, useGVCF);
        data.basic2SAMRecordTransfer = new Basic2SAMRecordTransfer(header);

        VCFCodec codec = new VCFCodec();
        VCFHeaderLineIterable headerLineIterable = new VCFHeaderLineIterable(vcfHeaderInfo);
        data.vcfFileHeader = (VCFHeader) codec.readActualHeader(headerLineIterable);
        data.codec = codec;
        return data;
    }

    /**
     * Scala-friendly overload: converts the immutable Scala list to a Java list and delegates to
     * {@link #callVariants(RefContigInfo, List, boolean, SamHeaderInfo, VcfHeaderInfo)}.
     */
    public static List<VcfRecord> callVariants(RefContigInfo refContigInfo,
                                               scala.collection.immutable.List<SerializableActiveRegionMapData> serializableActiveRegionMapDataList,
                                               boolean useGVCF,
                                               SamHeaderInfo samHeaderInfo,
                                               VcfHeaderInfo vcfHeaderInfo) {
        return callVariants(refContigInfo, CollectionConverter.asJavaList(serializableActiveRegionMapDataList),
                useGVCF, samHeaderInfo, vcfHeaderInfo);
    }

    /**
     * Runs the HaplotypeCaller over previously discovered active regions.
     *
     * @param refContigInfo                       contig description.
     * @param serializableActiveRegionMapDataList active regions produced by {@link #getActiveRegions}.
     * @param useGVCF                             currently not used by this method (see note in the body).
     * @param samHeaderInfo                       SAM header description (modified, see {@link #getStaticDataInstance}).
     * @param vcfHeaderInfo                       VCF header description.
     * @return the called variants converted to {@link VcfRecord}s.
     */
    public static List<VcfRecord> callVariants(RefContigInfo refContigInfo,
                                               List<SerializableActiveRegionMapData> serializableActiveRegionMapDataList,
                                               boolean useGVCF,
                                               SamHeaderInfo samHeaderInfo,
                                               VcfHeaderInfo vcfHeaderInfo) {
        // NOTE(review): the caller is always built with useGVCF == false, ignoring the parameter.
        // This looks deliberate because no GenotypeGVCFs step follows below, but confirm.
        StaticData staticData = getStaticDataInstance(refContigInfo, false, samHeaderInfo, vcfHeaderInfo);
        GenomeLocParser parser = staticData.genomeLocParser;
        HaplotypeCaller haplotypeCaller = staticData.haplotypeCaller;
        Basic2SAMRecordTransfer basic2SAMRecordTransfer = staticData.basic2SAMRecordTransfer;
        VCFHeader vcfFileHeader = staticData.vcfFileHeader;
        VCFCodec codec = staticData.codec;

        List<VariantContext> variantContexts = new ArrayList<>();
        for (SerializableActiveRegionMapData mapData : serializableActiveRegionMapDataList) {
            ActiveRegionMapData activeRegionMapData = mapData.toActiveRegionMapData(
                    parser, basic2SAMRecordTransfer, vcfFileHeader, codec);
            variantContexts.addAll(haplotypeCaller.map(activeRegionMapData));
        }

        // If GVCF mode were used, a GenotypeGVCFs pass would have to be added here.
        List<VariantContext> finalResult = variantContexts;

        VCFHeader header = StandardWGSVCFHeader.getHeader();
        VC2VcfRecordTransfer transfer = new VC2VcfRecordTransfer(header, refContigInfo);
        List<VcfRecord> vcfRecords = new ArrayList<>(finalResult.size());
        for (VariantContext vc : finalResult) {
            vcfRecords.add(transfer.transfer(vc));
        }
        return vcfRecords;
    }
}
================================================
FILE: src/main/java/org/ncic/bioinfo/sparkseq/algorithms/adapters/IndelRealignAdapter.java
================================================
/*
* Copyright (c) 2017 NCIC, Institute of Computing Technology, Chinese Academy of Sciences
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package org.ncic.bioinfo.sparkseq.algorithms.adapters;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SAMSequenceDictionary;
import org.ncic.bioinfo.sparkseq.algorithms.data.reference.RefContentProvider;
import org.ncic.bioinfo.sparkseq.algorithms.data.sam.SamContentProvider;
import org.ncic.bioinfo.sparkseq.algorithms.data.vcf.RODContentProvider;
import org.ncic.bioinfo.sparkseq.algorithms.data.vcf.RODNames;
import org.ncic.bioinfo.sparkseq.algorithms.utils.GenomeLoc;
import org.ncic.bioinfo.sparkseq.algorithms.utils.GenomeLocParser;
import org.ncic.bioinfo.sparkseq.algorithms.walker.indelrealigner.IndelRealigner;
import org.ncic.bioinfo.sparkseq.algorithms.walker.realignertargetcreator.RealignerTargetCreator;
import org.ncic.bioinfo.sparkseq.data.basic.BasicSamRecord;
import org.ncic.bioinfo.sparkseq.data.common.RefContigInfo;
import org.ncic.bioinfo.sparkseq.data.common.SamHeaderInfo;
import org.ncic.bioinfo.sparkseq.data.partition.FastaPartition;
import org.ncic.bioinfo.sparkseq.data.partition.SamRecordPartition;
import org.ncic.bioinfo.sparkseq.data.partition.VcfRecordPartition;
import org.ncic.bioinfo.sparkseq.transfer.SAMRecord2BasicTransfer;
import org.ncic.bioinfo.sparkseq.transfer.SAMSequenceDictTransfer;
import scala.collection.JavaConversions;
import java.util.ArrayList;
import java.util.List;
/**
 * Adapter that runs target creation followed by indel realignment on one partition and
 * returns the realigned reads as {@link BasicSamRecord}s.
 *
 * Author: wbc
 */
public class IndelRealignAdapter {

    /**
     * Realigns the reads of a partition around indels.
     *
     * @param refContigInfo      contig description used to build the sequence dictionary.
     * @param samRecordPartition reads of the partition.
     * @param refPartition       reference bases of the partition.
     * @param rodPartitions      VCF partitions; each one is registered as a ROD provider named
     *                           {@code RODNames.KNOWN_ALLELES + key()}.
     * @return the reads reported by {@link IndelRealigner#getResultSam()}, converted one by one.
     */
    public static List<BasicSamRecord> realign(RefContigInfo refContigInfo,
                                               SamRecordPartition samRecordPartition,
                                               FastaPartition refPartition,
                                               List<VcfRecordPartition> rodPartitions) {
        // Data preparation.
        SAMSequenceDictionary samSequenceDictionary = SAMSequenceDictTransfer.transfer(refContigInfo);
        GenomeLocParser parser = new GenomeLocParser(samSequenceDictionary);

        SamContentProvider samContentProvider = new SamContentProvider(samRecordPartition);
        RefContentProvider refContentProvider = new RefContentProvider(samSequenceDictionary, refPartition);

        List<RODContentProvider> rodContentProviders = new ArrayList<>(rodPartitions.size());
        for (VcfRecordPartition rodPartition : rodPartitions) {
            rodContentProviders.add(
                    new RODContentProvider(RODNames.KNOWN_ALLELES + rodPartition.key(), rodPartition, parser));
        }

        // Find the target intervals.
        RealignerTargetCreator realignerTargetCreator = new RealignerTargetCreator(
                parser, refContentProvider, samContentProvider, rodContentProviders);
        realignerTargetCreator.run();
        List<GenomeLoc> targetIntervals = realignerTargetCreator.getTargetIntervals();

        // Realign the reads inside those intervals.
        IndelRealigner indelRealigner = new IndelRealigner(
                parser, refContentProvider, samContentProvider, rodContentProviders, targetIntervals);
        indelRealigner.run();

        // Convert the resulting SAM records to basic SAM records.
        List<SAMRecord> samRecords = indelRealigner.getResultSam();
        List<BasicSamRecord> basicSamRecords = new ArrayList<>(samRecords.size());
        SAMRecord2BasicTransfer transfer = new SAMRecord2BasicTransfer();
        for (SAMRecord record : samRecords) {
            basicSamRecords.add(transfer.transfer(record));
        }
        return basicSamRecords;
    }
}
================================================
FILE: src/main/java/org/ncic/bioinfo/sparkseq/algorithms/adapters/MutectAdapter.java
================================================
/*
* Copyright (c) 2017 NCIC, Institute of Computing Technology, Chinese Academy of Sciences
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package org.ncic.bioinfo.sparkseq.algorithms.adapters;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFHeader;
import org.ncic.bioinfo.sparkseq.algorithms.data.reference.RefContentProvider;
import org.ncic.bioinfo.sparkseq.algorithms.data.sam.SamContentProvider;
import org.ncic.bioinfo.sparkseq.algorithms.data.vcf.RODContentProvider;
import org.ncic.bioinfo.sparkseq.algorithms.data.vcf.header.StandardWGSVCFHeader;
import org.ncic.bioinfo.sparkseq.algorithms.utils.GenomeLoc;
import org.ncic.bioinfo.sparkseq.algorithms.utils.GenomeLocParser;
import org.ncic.bioinfo.sparkseq.algorithms.walker.mutect.Mutect;
import org.ncic.bioinfo.sparkseq.data.basic.VcfRecord;
import org.ncic.bioinfo.sparkseq.data.common.Locus;
import org.ncic.bioinfo.sparkseq.data.common.RefContigInfo;
import org.ncic.bioinfo.sparkseq.data.partition.FastaPartition;
import org.ncic.bioinfo.sparkseq.data.partition.SamRecordPartition;
import org.ncic.bioinfo.sparkseq.data.partition.VcfRecordPartition;
import org.ncic.bioinfo.sparkseq.transfer.SAMSequenceDictTransfer;
import org.ncic.bioinfo.sparkseq.transfer.VC2VcfRecordTransfer;
import scala.collection.JavaConversions;
import java.util.ArrayList;
import java.util.List;
/**
 * Adapter that feeds a tumor/normal pair of read partitions to the {@link Mutect} walker and
 * returns its calls as {@link VcfRecord}s.
 *
 * Author: wbc
 */
public class MutectAdapter {

    /**
     * Calls somatic variants for one partition.
     *
     * @param refContigInfo            contig description used to build the sequence dictionary.
     * @param tumorSamRecordPartition  tumor reads of the partition.
     * @param normalSamRecordPartition normal reads of the partition.
     * @param refPartition             reference bases of the partition.
     * @param rodPartitions            VCF partitions; each one becomes a ROD provider named by its {@code key()}.
     * @param intervals                candidate intervals; only those overlapping the locus of
     *                                 {@code refPartition} are handed to Mutect.
     * @return the records reported by {@link Mutect#getResultVCFRecords()}, converted one by one.
     */
    public static List<VcfRecord> callVariants(RefContigInfo refContigInfo,
                                               SamRecordPartition tumorSamRecordPartition,
                                               SamRecordPartition normalSamRecordPartition,
                                               FastaPartition refPartition,
                                               List<VcfRecordPartition> rodPartitions,
                                               List<Locus> intervals) {
        SAMSequenceDictionary samSequenceDictionary = SAMSequenceDictTransfer.transfer(refContigInfo);
        GenomeLocParser parser = new GenomeLocParser(samSequenceDictionary);

        SamContentProvider tumorSamContentProvider = new SamContentProvider(tumorSamRecordPartition);
        SamContentProvider normalSamContentProvider = new SamContentProvider(normalSamRecordPartition);
        RefContentProvider refContentProvider = new RefContentProvider(samSequenceDictionary, refPartition);

        List<RODContentProvider> rodContentProviders = new ArrayList<>(rodPartitions.size());
        for (VcfRecordPartition rodPartition : rodPartitions) {
            rodContentProviders.add(new RODContentProvider(rodPartition.key(), rodPartition, parser));
        }

        // Keep only the intervals that overlap the region covered by the reference partition.
        List<GenomeLoc> intervalLocus = new ArrayList<>();
        GenomeLoc traverseLocus = refContentProvider.getLocus();
        for (Locus locus : intervals) {
            GenomeLoc interval = new GenomeLoc(locus.contigName(), locus.contigId(), locus.start(), locus.stop());
            if (interval.overlapsP(traverseLocus)) {
                intervalLocus.add(interval);
            }
        }

        // NOTE(review): unlike the other walkers driven by the adapters in this package, no
        // explicit run() is invoked on Mutect before its results are read. Verify that the
        // constructor or getResultVCFRecords() performs the traversal.
        Mutect mutect = new Mutect(parser, refContentProvider,
                tumorSamContentProvider, normalSamContentProvider, rodContentProviders, intervalLocus);
        List<VariantContext> finalResult = mutect.getResultVCFRecords();

        VCFHeader header = StandardWGSVCFHeader.getHeader();
        VC2VcfRecordTransfer transfer = new VC2VcfRecordTransfer(header, refContigInfo);
        List<VcfRecord> vcfRecords = new ArrayList<>(finalResult.size());
        for (VariantContext vc : finalResult) {
            vcfRecords.add(transfer.transfer(vc));
        }
        return vcfRecords;
    }
}
================================================
FILE: src/main/java/org/ncic/bioinfo/sparkseq/algorithms/data/basic/CountSet.java
================================================
/*
* Copyright (c) 2017 NCIC, Institute of Computing Technology, Chinese Academy of Sciences
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package org.ncic.bioinfo.sparkseq.algorithms.data.basic;
import java.lang.reflect.Array;
import java.util.Arrays;
import java.util.Collection;
import java.util.Iterator;
import java.util.NoSuchElementException;
import java.util.Set;
/**
 * Set of primitive {@code int} values kept in ascending order inside a growable array.
 * <p>
 * Look-ups use binary search over {@code elements[0 .. size - 1]}; insertions and removals shift
 * the tail of the array. The class is not thread-safe.
 * </p>
 *
 * Author: wbc
 */
public class CountSet implements Cloneable, Set<Integer> {

    /**
     * The size of the set.
     */
    private int size;

    /**
     * Holds the element of the set within the subrange <code>[0 .. size - 1]</code> in ascending order.
     */
    private int[] elements;

    /**
     * Creates a copy of an existing int-set.
     *
     * @param template the intset to copy values from.
     */
    public CountSet(final CountSet template) {
        elements = template.elements.clone();
        size = template.size;
    }

    /**
     * Creates a new set indicating the expected maximum number of elements it will contain.
     *
     * @param initialCapacity the desired initial capacity of the set.
     * @throws IllegalArgumentException if <code>initialCapacity</code> is negative.
     */
    public CountSet(int initialCapacity) {
        if (initialCapacity < 0) {
            throw new IllegalArgumentException();
        }
        elements = new int[initialCapacity];
        size = 0;
    }

    /**
     * Set the set contents to a single integer value.
     *
     * @param value the integer value to set the set to.
     */
    public void setTo(int value) {
        ensureCapacity(1);
        size = 1;
        elements[0] = value;
    }

    /**
     * Set the content of this set to a collection of integers. Repeated values are stored once.
     *
     * @param values the new values to be included in the set.
     * @throws NullPointerException if <code>values</code> is <code>null</code>.
     */
    public void setTo(int... values) {
        final int count = values.length;
        ensureCapacity(count);
        System.arraycopy(values, 0, elements, 0, count);
        Arrays.sort(elements, 0, count);
        // Compact runs of equal values so the "no duplicates" invariant of a set holds.
        int unique = 0;
        for (int i = 0; i < count; i++) {
            if (unique == 0 || elements[i] != elements[unique - 1]) {
                elements[unique++] = elements[i];
            }
        }
        size = unique;
    }

    /**
     * Increase (or decrease) all elements in the set by a number.
     *
     * @param delta the number of add (or substract if negative) to all elements.
     * @return <code>true</code> if the set changed as a result of this invocation, <code>false</code> otherwise.
     */
    public boolean incAll(final int delta) {
        if (size == 0 || delta == 0) {
            return false;
        }
        for (int i = 0; i < size; i++) {
            elements[i] += delta;
        }
        return true;
    }

    /**
     * Returns the smallest integer value in the set.
     *
     * @return the smallest integer value in the set.
     * @throws NoSuchElementException if the set is empty (thus there is no minimum).
     */
    public int min() {
        if (size == 0) {
            throw new NoSuchElementException("cannot have a min from an empty set");
        }
        return elements[0];
    }

    /**
     * Returns the largest integer value in the set.
     *
     * @return the largest integer value in the set.
     * @throws NoSuchElementException if the set is empty (thus there is no maximum).
     */
    public int max() {
        if (size == 0) {
            throw new NoSuchElementException("cannot have a max from an empty set");
        }
        return elements[size - 1];
    }

    /**
     * Adds a range of integer values to the collection.
     * <p>
     * This method avoid the need to explicity indicate all values in that range. Notice that the range is
     * fully inclusive. You can indicate a decrease range (fromValue &gt; toValue).
     * </p>
     *
     * @param fromValue the first value to add in the set (inclusive).
     * @param toValue   the last value to add to the set (inclusive).
     * @return <code>true</code> if the set changed as a result of this invocation, <code>false</code> otherwise.
     */
    public boolean addRange(final int fromValue, final int toValue) {
        final int lowEnd = Math.min(fromValue, toValue);
        final int highEnd = Math.max(fromValue, toValue);
        //TODO to be optimized to add missing sub-ranges in one go:
        boolean result = false;
        for (int i = lowEnd; i <= highEnd; i++) {
            result = add(i) | result;
        }
        return result;
    }

    /**
     * Add an integer value to the set.
     *
     * @param value to add to the set.
     * @return <code>true</code> if the set changed as a result of this invocation, <code>false</code> otherwise.
     */
    public boolean add(final int value) {
        final int pos = Arrays.binarySearch(elements, 0, size, value);
        if (pos >= 0) {
            return false;
        }
        // A negative result encodes the insertion point as (-(insertion point) - 1).
        final int insertPos = -pos - 1;
        ensureCapacity(size + 1);
        System.arraycopy(elements, insertPos, elements, insertPos + 1, size - insertPos);
        elements[insertPos] = value;
        size++;
        return true;
    }

    /**
     * Add a arbitrary number of integers to the set.
     *
     * @param values integer to add to the set.
     * @return <code>true</code> if the set changed as a result of this invocation, <code>false</code> otherwise.
     */
    public boolean addAll(final int... values) {
        ensureCapacity(size + values.length);
        boolean result = false;
        for (final int v : values) {
            result = add(v) | result;
        }
        return result;
    }

    @Override
    public boolean addAll(final Collection<? extends Integer> numbers) {
        ensureCapacity(size + numbers.size());
        boolean result = false;
        for (final Number n : numbers) {
            result = add(n.intValue()) | result;
        }
        return result;
    }

    /**
     * Add all values within a range in an integer array.
     *
     * @param source    array where the values to add are found.
     * @param fromIndex first position from <code>source</code> to add (inclusive).
     * @param toIndex   index after the last position in <code>source</code> to add (thus exclusive).
     * @return <code>true</code> if the set changed as a result of this invocation, <code>false</code> otherwise.
     * @throws NullPointerException           if <code>source</code> is <code>null</code>.
     * @throws ArrayIndexOutOfBoundsException if <code>fromIndex</code> or <code>toIndex</code> are beyond bounds
     *                                        allowed <code>[0 .. source.length]</code>.
     */
    public boolean addAll(final int[] source, final int fromIndex, final int toIndex) {
        if (source == null) {
            throw new NullPointerException();
        }
        // Reserve room only for the requested slice, never more than the array can provide.
        ensureCapacity(size + Math.min(source.length, Math.max(0, toIndex - fromIndex)));
        boolean result = false;
        for (int i = fromIndex; i < toIndex; i++) {
            result = add(source[i]) | result;
        }
        return result;
    }

    /**
     * Add all elements present in a int-set.
     *
     * @param other the other inset.
     * @return <code>true</code> if this set changed due to this operation, <code>false</code> otherwise.
     * @throws NullPointerException if <code>other</code> is <code>null</code>.
     */
    public boolean addAll(final CountSet other) {
        return addAll(other.elements, 0, other.size);
    }

    /**
     * Checks whether a integer value is included in the set.
     *
     * @param value the value to check.
     * @return <code>true</code> if <code>value</code> is inside the set, <code>false</code> otherwise.
     */
    public boolean contains(final int value) {
        return Arrays.binarySearch(elements, 0, size, value) >= 0;
    }

    /**
     * Make sure that this int-set has capacity to handle a number of elements.
     * <p>
     * If the set has already that or greater capacity nothing would be changed.
     * </p>
     *
     * @param capacity the requested capacity.
     */
    private void ensureCapacity(final int capacity) {
        if (elements.length >= capacity) {
            return;
        }
        // Grow at least geometrically so repeated single adds stay cheap.
        final int newLength = Math.max(elements.length << 1, capacity);
        elements = Arrays.copyOf(elements, newLength);
    }

    @Override
    public int size() {
        return size;
    }

    @Override
    public boolean isEmpty() {
        return size() == 0;
    }

    @Override
    public boolean contains(final Object o) {
        return o instanceof Integer && contains(((Integer) o).intValue());
    }

    @Override
    public Iterator<Integer> iterator() {
        return new MyIterator();
    }

    @Override
    public Object[] toArray() {
        final Integer[] result = new Integer[size];
        for (int i = 0; i < size; i++) {
            result[i] = elements[i];
        }
        return result;
    }

    @Override
    @SuppressWarnings("unchecked")
    public <T> T[] toArray(final T[] a) {
        if (a == null) {
            throw new NullPointerException();
        }
        final Class<?> componentClass = a.getClass().getComponentType();
        if (!componentClass.isAssignableFrom(Integer.class)) {
            throw new ArrayStoreException();
        }
        final T[] dest = (a.length < size) ? (T[]) Array.newInstance(componentClass, size) : a;
        for (int i = 0; i < size; i++) {
            dest[i] = (T) Integer.valueOf(elements[i]);
        }
        // Collection.toArray(T[]) contract: mark the end when the array has spare room.
        if (dest.length > size) {
            dest[size] = null;
        }
        return dest;
    }

    /**
     * Copies the content of the set into an integer array. The result can be freely modified by the invoker.
     *
     * @return never <code>null</code> but a zero-length array if the set is empty.
     */
    public int[] toIntArray() {
        return Arrays.copyOfRange(elements, 0, size);
    }

    /**
     * Copy the content of the set into an array.
     *
     * @param dest   the destination array.
     * @param offset where to store the first element of the set.
     * @throws NullPointerException           if <code>dest</code> is <code>null</code>.
     * @throws ArrayIndexOutOfBoundsException if <code>offset</code> is out of range of there is not enough
     *                                        space after <code>offset</code> in the destination array to hold
     *                                        all values in the set.
     */
    public void copyTo(final int[] dest, int offset) {
        if (dest == null) {
            throw new NullPointerException();
        }
        if (dest.length < (size + offset)) {
            throw new ArrayIndexOutOfBoundsException("destination is to short");
        }
        System.arraycopy(elements, 0, dest, offset, size);
    }

    /**
     * Copy the content of the set into an array, starting at position 0.
     *
     * @param dest the destination array.
     * @throws NullPointerException           if <code>dest</code> is <code>null</code>.
     * @throws ArrayIndexOutOfBoundsException if there is not enough space in the destination array to hold
     *                                        all values in the set.
     */
    public void copyTo(final int[] dest) {
        copyTo(dest, 0);
    }

    @Override
    public boolean add(final Integer integer) {
        return add(integer.intValue());
    }

    @Override
    public boolean remove(final Object o) {
        return o instanceof Integer && remove(((Integer) o).intValue());
    }

    /**
     * Removes a single integer value for the set.
     *
     * @param i the value to remove.
     * @return <code>true</code> if the set has changed as a result of this invocation, <code>false</code> otherwise.
     */
    public boolean remove(final int i) {
        final int pos = Arrays.binarySearch(elements, 0, size, i);
        if (pos < 0) {
            return false;
        }
        removeIndex(pos);
        return true;
    }

    @Override
    public boolean containsAll(final Collection<?> c) {
        for (final Object o : c) {
            if (!contains(o)) {
                return false;
            }
        }
        return true;
    }

    @Override
    public boolean retainAll(final Collection<?> c) {
        if (size == 0) {
            return false;
        }
        // Sorted positions of the elements to keep, fenced by the sentinels -1 and size so that
        // every gap between two consecutive entries is a (possibly empty) range to delete.
        @SuppressWarnings("all")
        final CountSet retainIndices = new CountSet(c.size() + 2);
        retainIndices.add(-1);
        retainIndices.add(size);
        for (final Object o : c) {
            if (!(o instanceof Integer)) {
                continue;
            }
            final int pos = Arrays.binarySearch(elements, 0, size, (Integer) o);
            if (pos >= 0) {
                retainIndices.add(pos);
            }
        }
        if (retainIndices.size == 2) {
            // Nothing to keep.
            size = 0;
            return true;
        }
        if (retainIndices.size == size + 2) {
            // Everything is kept.
            return false;
        }
        // Delete the gaps back to front so the lower positions stay valid while shifting.
        for (int idx = retainIndices.size - 1; idx > 0; idx--) {
            final int toIdx = retainIndices.elements[idx];
            final int fromIdx = retainIndices.elements[idx - 1] + 1;
            removeIndices(fromIdx, toIdx);
        }
        return true;
    }

    /**
     * Removes the values found in a range of indexes in {@link #elements}.
     *
     * @param fromIdx first index to remove (inclusive).
     * @param toIdx   right after last index to remove (exclusive).
     */
    private void removeIndices(final int fromIdx, final int toIdx) {
        if (fromIdx >= toIdx) {
            return;
        }
        System.arraycopy(elements, toIdx, elements, fromIdx, size - toIdx);
        size -= toIdx - fromIdx;
    }

    @Override
    public boolean removeAll(final Collection<?> c) {
        boolean result = false;
        for (final Object o : c) {
            result = remove(o) | result;
        }
        return result;
    }

    /**
     * Removes the element stored at a position of {@link #elements} and shrinks the set by one.
     *
     * @param idx position of the element to remove, within <code>[0 .. size - 1]</code>.
     */
    private void removeIndex(int idx) {
        System.arraycopy(elements, idx + 1, elements, idx, size - idx - 1);
        size--;
    }

    @Override
    public void clear() {
        size = 0;
    }

    /**
     * Returns a copy of this set which can be changed without modifying the original one.
     *
     * @return never {@code null}.
     */
    @Override
    @SuppressWarnings("all")
    public CountSet clone() {
        return new CountSet(this);
    }

    /**
     * Renders the set as its elements in ascending order, comma separated and enclosed in braces,
     * e.g. <code>{1,5,9}</code>; the empty set is rendered as <code>{}</code>.
     */
    @Override
    public String toString() {
        if (size == 0) {
            return "{}";
        }
        final StringBuilder sb = new StringBuilder(2 + size * 10);
        sb.append('{');
        for (int i = 0; i < size; i++) {
            sb.append(elements[i]).append(',');
        }
        // Turn the trailing comma into the closing brace.
        sb.setCharAt(sb.length() - 1, '}');
        return sb.toString();
    }

    /**
     * Custom iterator class for {@link CountSet IntSets}
     */
    private class MyIterator implements Iterator<Integer> {

        /** Position of the element the next call to {@link #next()} returns. */
        private int next = 0;

        /** Whether {@link #next()} has been called since the last {@link #remove()}. */
        private boolean canRemove = false;

        @Override
        public boolean hasNext() {
            return next < size;
        }

        @Override
        public Integer next() {
            if (next >= size) {
                throw new NoSuchElementException();
            }
            canRemove = true;
            return elements[next++];
        }

        @Override
        public void remove() {
            if (!canRemove) {
                throw new IllegalStateException();
            }
            // The element just returned sits right before the cursor; after deleting it the
            // cursor must step back because the tail was shifted one slot to the left.
            removeIndex(--next);
            canRemove = false;
        }
    }
}
================================================
FILE: src/main/java/org/ncic/bioinfo/sparkseq/algorithms/data/basic/DefaultHashMap.java
================================================
/*
* Copyright (c) 2017 NCIC, Institute of Computing Technology, Chinese Academy of Sciences
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package org.ncic.bioinfo.sparkseq.algorithms.data.basic;
import java.util.HashMap;
/**
 * A {@link HashMap} whose {@link #get(Object)} answers with a configurable default value for
 * keys that have no mapping. A key explicitly mapped to {@code null} still yields {@code null}.
 *
 * @param <K> type of the keys.
 * @param <V> type of the values (and of the default value).
 *
 * Author: wbc
 */
public class DefaultHashMap<K, V> extends HashMap<K, V> {

    /** Value returned by {@link #get(Object)} for keys that are not present in the map. */
    protected V defaultValue;

    /**
     * Creates an empty map.
     *
     * @param defaultValue value to report for missing keys; may be {@code null}.
     */
    public DefaultHashMap(V defaultValue) {
        this.defaultValue = defaultValue;
    }

    /**
     * Replaces the value reported for missing keys.
     *
     * @param defaultValue the new default; may be {@code null}.
     */
    public void setDefaultValue(V defaultValue) {
        this.defaultValue = defaultValue;
    }

    /**
     * Returns the value mapped to the key, or the default value when the key is absent.
     *
     * @param k the key to look up.
     * @return the mapped value (possibly {@code null} if {@code null} was stored), or the
     *         default value if the map has no entry for {@code k}.
     */
    @Override
    public V get(Object k) {
        final V stored = super.get(k);
        // A null result is ambiguous: only fall back when the key really has no entry.
        if (stored == null && !containsKey(k)) {
            return defaultValue;
        }
        return stored;
    }
}
================================================
FILE: src/main/java/org/ncic/bioinfo/sparkseq/algorithms/data/basic/IndexedSet.java
================================================
/*
* Copyright (c) 2017 NCIC, Institute of Computing Technology, Chinese Academy of Sciences
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package org.ncic.bioinfo.sparkseq.algorithms.data.basic;
import it.unimi.dsi.fastutil.objects.Object2IntMap;
import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
import java.util.AbstractSet;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.ListIterator;
import java.util.Set;
/**
* Set set where each element can be reference by a unique integer index that runs from
* 0 to the size of the set - 1.
*
* @author Valentin Ruano-Rubio <valentin@broadinstitute.org>
*/
public class IndexedSet extends AbstractSet implements Set {
/**
* Elements stored in an array-list by their index.
*/
private final ArrayList elements;
/**
* A unmodifiable view to the element list. Initially {@code null} it is thread-unsafe lazy instantiated
* when requested first time through {@link #asList}. Therefore typically it is shared by invoking code but
* there could be some extra copies (rare though) in multi-thread runs.
*/
private transient List unmodifiableElementsListView;
/**
* Quick element to index lookup map.
*
* Uses a primitive int value map for efficiency sake.
*
*/
private final Object2IntMap indexByElement;
/**
* Creates an empty indexed set indicating the expected number of elements.
*
* @param initialCapacity the initial capacity handed to both the backing element list and the
* element-to-index map.
*/
public IndexedSet(final int initialCapacity) {
elements = new ArrayList<>(initialCapacity);
indexByElement = new Object2IntOpenHashMap<>(initialCapacity);
}
/**
 * Creates a new indexed set from an existing collection of elements.
 *
 * <p>
 * Elements will be indexed as they appear in the input collection. Repeats will be ignored.
 * </p>
 *
 * @param values the elements to include in the set.
 *
 * @throws IllegalArgumentException
 * if the {@code values} collection is {@code null} itself, or it contains {@code null}.
 */
@SuppressWarnings("unchecked")
public IndexedSet(final Collection<E> values) {
    if (values == null)
        throw new IllegalArgumentException("input values cannot be null");

    final int initialCapacity = values.size();
    elements = new ArrayList<>(initialCapacity);
    indexByElement = new Object2IntOpenHashMap<>(initialCapacity);
    // nextIndex counts the distinct elements accepted so far, i.e. the index the next new element gets.
    int nextIndex = 0;
    for (final E value : values) {
        if (value == null)
            throw new IllegalArgumentException("null element not allowed: index == " + nextIndex);
        if (indexByElement.containsKey(value))
            continue;
        indexByElement.put(value, nextIndex++);
        elements.add(value);
    }
}
/**
 * Creates a new indexed set from an existing array of elements.
 *
 * <p>
 * Elements will be indexed as they appear in the array. Repeats will be ignored.
 * </p>
 *
 * @param values the elements to include in the set.
 *
 * @throws IllegalArgumentException
 * if the {@code values} array is {@code null} itself, or it contains {@code null}.
 */
@SuppressWarnings("unchecked")
public IndexedSet(final E ... values) {
    if (values == null)
        throw new IllegalArgumentException("input values cannot be null");

    final int initialCapacity = values.length;
    elements = new ArrayList<>(initialCapacity);
    indexByElement = new Object2IntOpenHashMap<>(initialCapacity);
    // nextIndex counts the distinct elements accepted so far, i.e. the index the next new element gets.
    int nextIndex = 0;
    for (final E value : values) {
        if (value == null)
            throw new IllegalArgumentException("null element not allowed: index == " + nextIndex);
        if (indexByElement.containsKey(value))
            continue;
        indexByElement.put(value, nextIndex++);
        elements.add(value);
    }
}
/**
 * Returns a read-only list view of the elements in the set, ordered by index.
 *
 * <p>
 *     The view is created on first use and cached. It reflects later changes to the
 *     set, but any attempt to modify the set through it results in an
 *     {@link UnsupportedOperationException}.
 * </p>
 *
 * @return never {@code null}.
 */
public List asList() {
    if (unmodifiableElementsListView != null) {
        return unmodifiableElementsListView;
    }
    unmodifiableElementsListView = Collections.unmodifiableList(elements);
    return unmodifiableElementsListView;
}
/**
 * Validates an element index, failing if it is out of bounds.
 *
 * <p>
 *     An element index is valid iff it lies within [0,{@link #size()}).
 * </p>
 *
 * @param index the query index.
 *
 * @throws IllegalArgumentException if {@code index} is out of bounds.
 */
protected void checkIndex(final int index) {
    if (index < 0) {
        throw new IllegalArgumentException("the index cannot be negative: " + index);
    }
    final int length = size();
    if (index >= length) {
        throw new IllegalArgumentException("the index is equal or larger than the list length: " + index + " >= " + length);
    }
}
/**
 * Returns an iterator over the elements in index order.
 *
 * <p>
 *     The iterator is obtained from the read-only view returned by {@link #asList()},
 *     so it cannot be used to remove elements.
 * </p>
 *
 * @return never {@code null}.
 */
@Override
public Iterator iterator() {
    return this.asList().iterator();
}
/**
 * Returns the number of elements in the set.
 *
 * @return 0 or greater.
 */
@Override
public int size() {
    return this.elements.size();
}
/**
 * Checks whether an object is an element of this set.
 *
 * @param o the object to look up; {@code null} is never an element.
 * @return {@code true} iff {@code o} is not {@code null} and is in the set.
 */
@Override
@SuppressWarnings("all")
public boolean contains(final Object o) {
    if (o == null) {
        return false;
    }
    return indexByElement.containsKey(o);
}
/**
 * Adds a new element to the set.
 *
 * <p>
 *     If the element is already in the set nothing happens and the method returns {@code false}.
 *     Otherwise the element is appended and assigned the next available index, which equals
 *     the size of the set before the addition, and the method returns {@code true}.
 * </p>
 *
 * @param o the object to add.
 *
 * @throws IllegalArgumentException if {@code o} is {@code null}.
 *
 * @return {@code true} iff the set was modified by this operation.
 */
@Override
public boolean add(final E o) {
    if (o == null) {
        throw new IllegalArgumentException("the input argument cannot be null");
    }
    final boolean isNew = !contains(o);
    if (isNew) {
        final int assignedIndex = size();
        elements.add(o);
        indexByElement.put(o, assignedIndex);
    }
    return isNew;
}
/**
 * Removes an element from the set.
 *
 * <p>
 *     If the object is not present in the set (including when it is {@code null}), nothing
 *     happens and the method returns {@code false}. Otherwise the element is removed, every
 *     element that followed it has its index decreased by one, and the method returns
 *     {@code true}.
 * </p>
 *
 * @param o the object to remove.
 *
 * @return {@code true} iff the set was modified by this operation.
 */
@Override
public boolean remove(final Object o) {
    // Membership must be tested explicitly: removeInt() on a missing key yields the map's
    // default return value, which none of the constructors in this class set to -1, so
    // comparing its result against -1 cannot tell "absent" apart from a real index.
    if (o == null || !indexByElement.containsKey(o))
        return false;
    final int index = indexByElement.removeInt(o);
    elements.remove(index);
    // Elements after the removed one have shifted one position to the left; refresh their indices.
    final int newSize = elements.size();
    for (int i = index; i < newSize; i++)
        indexByElement.put(elements.get(i), i);
    return true;
}
/**
 * Empties the set, discarding both the element list and the element-to-index lookup.
 */
@Override
public void clear() {
    this.indexByElement.clear();
    this.elements.clear();
}
/**
* Compares this with another indexed set.
* @param o the other object to compare to.
* @return {@code false} unless {@code o} is a indexed-set that contains the same elements in the same order.
*/
@Override
public boolean equals(final Object o) {
if (o == this)
return true;
if (o == null)
return false;
if (!(o instanceof IndexedSet>))
return false;
final IndexedSet> other = (IndexedSet>)o;
return equals(other);
}
/**
* Compare to another indexed set.
*
* @param other the target indexed set.
*
* @throws java.lang.IllegalArgumentException if {@code other} is {@code null}.
*
* @return {@code true} iff {@other} is not {@code null}, and contains exactly the same elements
* (as compared using {@link Object#equals} a this set with matching indices.
*/
public boolean equals(final IndexedSet> other) {
if (other == null)
throw new IllegalArgumentException("other cannot be null");
final ArrayList> otherElements = other.elements;
final int elementCount = elements.size();
if (otherElements.size() != elementCount)
return false;
for (int i = 0; i < elementCount; i++)
if (!elements.get(i).equals(otherElements.get(i)))
return false;
return true;
}
/**
 * Computes an order-sensitive hash code from the elements.
 *
 * <p>
 *     The value is the standard list hash: starting from 1, each element in index order
 *     contributes {@code 31 * result + hash(element)}.
 * </p>
 *
 * @return the hash code of the underlying element list.
 */
@Override
public int hashCode() {
    return elements.hashCode();
}
/**
 * Looks up an element by its index within the set.
 *
 * @param index the target element's index.
 *
 * @throws IllegalArgumentException if {@code index} is not valid, i.e. not in [0,{@link #size()}).
 *
 * @return never {@code null}, as {@code null} is not a valid element.
 */
public E get(final int index) {
    this.checkIndex(index);
    return this.elements.get(index);
}
/**
 * Looks up the index of an object.
 *
 * @param o the object of interest.
 *
 * @throws IllegalArgumentException if {@code o} is {@code null}.
 *
 * @return {@code -1} if the object is not an element of this set, otherwise its index in the
 * set, a value within [0,{@link #size()}).
 */
public int indexOf(final E o) {
    if (o == null) {
        throw new IllegalArgumentException("the query object cannot be null");
    }
    if (!indexByElement.containsKey(o)) {
        return -1;
    }
    return indexByElement.getInt(o);
}
}
================================================
FILE: src/main/java/org/ncic/bioinfo/sparkseq/algorithms/data/basic/IntMaxHeap.java
================================================
/*
* Copyright (c) 2017 NCIC, Institute of Computing Technology, Chinese Academy of Sciences
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package org.ncic.bioinfo.sparkseq.algorithms.data.basic;
import java.util.Arrays;
/**
 * A binary max-heap of primitive {@code int} values stored in an array that grows on demand.
 *
 * Author: wbc
 */
public class IntMaxHeap {

    /** Number of values currently held by the heap. */
    private int size;

    /** Heap storage; only the first {@code size} slots are meaningful. */
    private int[] values;

    /**
     * Creates a new empty heap with the requested initial capacity.
     *
     * @param initialCapacity number of elements you expect to have at most in the heap.
     *
     * @throws IllegalArgumentException if {@code initialCapacity} is negative.
     */
    public IntMaxHeap(final int initialCapacity) {
        if (initialCapacity < 0) {
            throw new IllegalArgumentException();
        }
        // Never allocate a zero-length array, so there is always at least one slot.
        values = new int[Math.max(1, initialCapacity)];
    }

    /**
     * Adds a new element to the heap, growing the storage if it is full.
     *
     * @param v the new element.
     */
    public void add(final int v) {
        ensureCapacity(size + 1);
        insertUnchecked(v);
    }

    /**
     * Places a value in the heap assuming there is room for it, floating it up to its position.
     *
     * @param v the value to add.
     */
    private void insertUnchecked(final int v) {
        int hole = size++;
        // Pull smaller ancestors down into the hole until v's resting place is found.
        while (hole > 0) {
            final int parent = (hole - 1) / 2;
            final int parentValue = values[parent];
            if (parentValue >= v) {
                break;
            }
            values[hole] = parentValue;
            hole = parent;
        }
        values[hole] = v;
    }

    /**
     * Adds several integers to the heap.
     *
     * @param v values to add.
     *
     * @throws IllegalArgumentException if {@code v} is {@code null}.
     */
    public void add(final int ... v) {
        if (v == null) {
            throw new IllegalArgumentException("the input array cannot be null");
        }
        ensureCapacity(v.length + size);
        for (int k = 0; k < v.length; k++) {
            insertUnchecked(v[k]);
        }
    }

    /**
     * Grows the backing array if it cannot hold {@code newSize} values.
     *
     * @param newSize the number of values that must fit.
     */
    private void ensureCapacity(final int newSize) {
        if (newSize <= values.length) {
            return;
        }
        // Grow to twice (current length + 10), or to the requested size if that is larger.
        final int grown = (10 + values.length) << 1;
        values = Arrays.copyOf(values, Math.max(newSize, grown));
    }

    /**
     * Returns the current maximum element without removing it.
     *
     * @throws IllegalStateException if the heap is empty.
     *
     * @return the maximum element in the heap.
     */
    public int peek() {
        if (size == 0) {
            throw new IllegalStateException("the heap is empty");
        }
        return values[0];
    }

    /**
     * Returns the maximum element of the heap and removes it.
     *
     * @throws IllegalArgumentException if the heap is empty.
     *
     * @return the maximum element in the heap before removing it.
     */
    public int remove() {
        if (size == 0) {
            throw new IllegalArgumentException("the heap is empty");
        }
        final int top = values[0];
        sinkLastFromRoot();
        return top;
    }

    /**
     * Restores the heap after the root has been taken, sinking the last value from the top down.
     */
    private void sinkLastFromRoot() {
        size--;
        // Nothing to reorganise when the removal emptied the heap.
        if (size == 0) {
            return;
        }
        final int moved = values[size]; // the former last value, to be re-seated.
        // First index in the heap that has no descendants within the heap.
        final int firstLeaf = size / 2;
        int hole = 0;
        while (hole < firstLeaf) {
            final int left = 2 * hole + 1;   // always inside the heap while hole < firstLeaf.
            final int right = left + 1;      // may fall outside the heap.
            int larger = left;
            if (right < size && values[right] > values[left]) {
                larger = right;
            }
            // Stop once neither descendant exceeds the value being re-seated.
            if (values[larger] <= moved) {
                break;
            }
            values[hole] = values[larger];
            hole = larger;
        }
        values[hole] = moved;
    }

    /**
     * Checks whether the heap is empty.
     *
     * @return {@code true} iff the heap is empty.
     */
    public boolean isEmpty() {
        return size == 0;
    }

    /**
     * Returns the current size of the heap.
     *
     * @return 0 or greater.
     */
    public int size() {
        return size;
    }

    /**
     * Removes all elements from the heap; the backing array is kept.
     */
    public void clear() {
        size = 0;
    }
}
================================================
FILE: src/main/java/org/ncic/bioinfo/sparkseq/algorithms/data/basic/NestedIntegerArray.java
================================================
/*
* Copyright (c) 2017 NCIC, Institute of Computing Technology, Chinese Academy of Sciences
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package org.ncic.bioinfo.sparkseq.algorithms.data.basic;
import org.apache.log4j.Logger;
import org.ncic.bioinfo.sparkseq.exceptions.ReviewedGATKException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/**
* Author: wbc
*/
public class NestedIntegerArray {
private static Logger logger = Logger.getLogger(NestedIntegerArray.class);
protected final Object[] data;
protected final int numDimensions;
protected final int[] dimensions;
// Preallocate the first two dimensions to limit contention during tree traversals in put()
private static final int NUM_DIMENSIONS_TO_PREALLOCATE = 2;
public NestedIntegerArray(final int... dimensions) {
numDimensions = dimensions.length;
if ( numDimensions == 0 )
throw new ReviewedGATKException("There must be at least one dimension to an NestedIntegerArray");
this.dimensions = dimensions.clone();
int dimensionsToPreallocate = Math.min(dimensions.length, NUM_DIMENSIONS_TO_PREALLOCATE);
if ( logger.isDebugEnabled() ) logger.debug(String.format("Creating NestedIntegerArray with dimensions %s", Arrays.toString(dimensions)));
if ( logger.isDebugEnabled() ) logger.debug(String.format("Pre-allocating first %d dimensions", dimensionsToPreallocate));
data = new Object[dimensions[0]];
preallocateArray(data, 0, dimensionsToPreallocate);
if ( logger.isDebugEnabled() ) logger.debug(String.format("Done pre-allocating first %d dimensions", dimensionsToPreallocate));
}
/**
* @return the dimensions of this nested integer array. DO NOT MODIFY
*/
public int[] getDimensions() {
return dimensions;
}
/**
* Recursively allocate the first dimensionsToPreallocate dimensions of the tree
*
* Pre-allocating the first few dimensions helps limit contention during tree traversals in put()
*
* @param subarray current node in the tree
* @param dimension current level in the tree
* @param dimensionsToPreallocate preallocate only this many dimensions (starting from the first)
*/
private void preallocateArray( Object[] subarray, int dimension, int dimensionsToPreallocate ) {
if ( dimension >= dimensionsToPreallocate - 1 ) {
return;
}
for ( int i = 0; i < subarray.length; i++ ) {
subarray[i] = new Object[dimensions[dimension + 1]];
preallocateArray((Object[])subarray[i], dimension + 1, dimensionsToPreallocate);
}
}
public T get(final int... keys) {
final int numNestedDimensions = numDimensions - 1;
Object[] myData = data;
for( int i = 0; i < numNestedDimensions; i++ ) {
if ( keys[i] >= dimensions[i] )
return null;
myData = (Object[])myData[keys[i]];
if ( myData == null )
return null;
}
return (T)myData[keys[numNestedDimensions]];
}
/**
* Insert a value at the position specified by the given keys.
*
* This method is thread-safe, however the caller MUST check the
* return value to see if the put succeeded. This method RETURNS FALSE if
* the value could not be inserted because there already was a value present
* at the specified location. In this case the caller should do a get() to get
* the already-existing value and (potentially) update it.
*
* @param value value to insert
* @param keys keys specifying the location of the value in the tree
* @return true if the value was inserted, false if it could not be inserted because there was already
* a value at the specified position
*/
public boolean put(final T value, final int... keys) { // WARNING! value comes before the keys!
if ( keys.length != numDimensions )
throw new ReviewedGATKException("Exactly " + numDimensions + " keys should be passed to this NestedIntegerArray but " + keys.length + " were provided");
final int numNestedDimensions = numDimensions - 1;
Object[] myData = data;
for ( int i = 0; i < numNestedDimensions; i++ ) {
if ( keys[i] >= dimensions[i] )
throw new ReviewedGATKException("Key " + keys[i] + " is too large for dimension " + i + " (max is " + (dimensions[i]-1) + ")");
// If we're at or beyond the last dimension that was pre-allocated, we need to do a synchronized
// check to see if the next branch exists, and if it doesn't, create it
if ( i >= NUM_DIMENSIONS_TO_PREALLOCATE - 1 ) {
synchronized ( myData ) {
if ( myData[keys[i]] == null ) {
myData[keys[i]] = new Object[dimensions[i + 1]];
}
}
}
myData = (Object[])myData[keys[i]];
}
synchronized ( myData ) { // lock the bottom row while we examine and (potentially) update it
// Insert the new value only if there still isn't any existing value in this position
if ( myData[keys[numNestedDimensions]] == null ) {
myData[keys[numNestedDimensions]] = value;
}
else {
// Already have a value for this leaf (perhaps another thread came along and inserted one
// while we traversed the tree), so return false to notify the caller that we didn't put
// the item
return false;
}
}
return true;
}
public List getAllValues() {
final List result = new ArrayList();
fillAllValues(data, result);
return result;
}
private void fillAllValues(final Object[] array, final List result) {
for ( Object value : array ) {
if ( value == null )
continue;
if ( value instanceof Object[] )
fillAllValues((Object[])value, result);
else
result.add((T)value);
}
}
public static class Leaf {
public final int[] keys;
public final T value;
public Leaf(final int[] keys, final T value) {
this.keys = keys;
this.value = value;
}
}
public List> getAllLeaves() {
final List> result = new ArrayList>();
fillAllLeaves(data, new int[0], result);
return result;
}
private void fillAllLeaves(final Object[] array, final int[] path, final List> result) {
for ( int key = 0; key < array.length; key++ ) {
final Object value = array[key];
if ( value == null )
continue;
final int[] newPath = appendToPath(path, key);
if ( value instanceof Object[] ) {
fillAllLeaves((Object[]) value, newPath, result);
} else {
result.add(new Leaf(newPath, (T)value));
}
}
}
private int[] appendToPath(final int[] path, final int newKey) {
final int[] newPath = new int[path.length + 1];
for ( int i = 0; i < path.length; i++ )
newPath[i] = path[i];
newPath[path.length] = newKey;
return newPath;
}
}
================================================
FILE: src/main/java/org/ncic/bioinfo/sparkseq/algorithms/data/basic/Pair.java
================================================
/*
* Copyright (c) 2017 NCIC, Institute of Computing Technology, Chinese Academy of Sciences
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package org.ncic.bioinfo.sparkseq.algorithms.data.basic;
/**
 * A mutable pair of two values of possibly different types.
 *
 * Author: wbc
 *
 * @param <X> the type of the first value.
 * @param <Y> the type of the second value.
 */
public class Pair<X, Y> {
    // declare public, STL-style for easier and more efficient access:
    public X first;
    public Y second;

    public Pair(X x, Y y) { first = x; second = y; }

    public void set(X x, Y y) { first = x; second = y; }

    /** Java-style getter; note that we currently allow direct access to
     the member field.
     */
    public X getFirst() { return first; }

    /** Java-style getter; note that we currently allow direct access to
     the member field.
     */
    public Y getSecond() { return second; }

    /**
     * Calculate whether this pair object is equal to another object.
     * @param o The other object (hopefully a pair).
     * @return True if {@code o} is a pair whose first and second values are equal to this
     *         pair's (two {@code null} values count as equal); false otherwise.
     */
    @Override
    public boolean equals( Object o ) {
        // instanceof is false for null, so this also rejects a null argument.
        if( !(o instanceof Pair) )
            return false;

        final Pair<?, ?> other = (Pair<?, ?>) o;

        // A null member only matches a null member; otherwise defer to the member's equals.
        final boolean sameFirst = this.first == null ? other.first == null : this.first.equals(other.first);
        if( !sameFirst )
            return false;
        return this.second == null ? other.second == null : this.second.equals(other.second);
    }

    /**
     * Basic hashcode function.  Assume hashcodes of first and second are
     * randomly distributed and return the XOR of the two; a {@code null} member contributes 0.
     * @return Randomly distributed hashcode of the pair.
     */
    @Override
    public int hashCode() {
        final int firstHash = first == null ? 0 : first.hashCode();
        final int secondHash = second == null ? 0 : second.hashCode();
        return firstHash ^ secondHash;
    }

    /**
     * @return the two values separated by a comma.
     */
    @Override
    public String toString() {
        return first+","+second;
    }
}
================================================
FILE: src/main/java/org/ncic/bioinfo/sparkseq/algorithms/data/basic/Permutation.java
================================================
/*
* Copyright (c) 2017 NCIC, Institute of Computing Technology, Chinese Academy of Sciences
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package org.ncic.bioinfo.sparkseq.algorithms.data.basic;
import java.util.List;
/**
* Represents a permutation of an ordered set or list of elements.
*
* <p>
* The "from" list is the original element list and the "to" list is the resulting one.
* </p>
*
* @author Valentin Ruano-Rubio <valentin@broadinstitute.org>
*/
public interface Permutation {
/**
* Checks whether this permutation is a partial one of the original list.
*
*
* A partial permutation is one in which not all original elements take part.
*
*
* @return {@code true} iff this is a partial permutation.
*/
public boolean isPartial();
/**
* Checks whether this is a trivial permutation where the resulting element list is the same as the original.
*
* @return {@code true} iff the resulting element list is the same as the original.
*/
public boolean isNonPermuted();
/**
* Given an index on the original list, returns the position of that element in the resulting list.
*
* @param fromIndex the query original element index.
*
* @throws IllegalArgumentException if {@code fromIndex} is not a valid index within the original list.
*
* @return -1 if that element is not part of the resulting (partial) permutation, otherwise some number between
* 0 and {@link #toSize()} - 1.
*/
public int toIndex(final int fromIndex);
/**
* Given an index on the resulting list, returns the index of that element on the original list.
* @param toIndex the query resulting list index.
*
* @throws IllegalArgumentException if {@code toIndex} is not a valid index within the resulting list
* (presumably [0,{@link #toSize()}) -- confirm against implementations).
*
* @return a value between 0 and {@link #fromSize()} - 1.
*/
public int fromIndex(final int toIndex);
/**
* Length of the original element list.
*
* @return 0 or greater.
*/
public int fromSize();
/**
* Length of the resulting element list.
*
* @return 0 or greater.
*/
public int toSize();
/**
* Returns an unmodifiable view to the original element list.
* @return never {@code null}.
*/
public List fromList();
/**
* Returns an unmodifiable view to the resulting element list.
* NOTE(review): the previous text said "original", apparently copied from {@link #fromList()};
* confirm against implementations.
*
* @return never {@code null}.
*/
public List toList();
}
================================================
FILE: src/main/java/org/ncic/bioinfo/sparkseq/algorithms/data/basic/PrimitivePair.java
================================================
/*
* Copyright (c) 2017 NCIC, Institute of Computing Technology, Chinese Academy of Sciences
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package org.ncic.bioinfo.sparkseq.algorithms.data.basic;
/**
* Author: wbc
*/
public class PrimitivePair {
/** Pair of two integers */
public static class Int {
// declare public, STL-style for easier and more efficient access:
public int first;
public int second;
public Int(int x, int y) { first = x; second = y; }
public Int() { first = second = 0; }
public void set(int x, int y) { first = x; second = y; }
/** Java-style getter; note that we currently allow direct access to
the member field.
*/
public int getFirst() { return first; }
/** Java-style getter; note that we currently allow direct access to
the member field.
*/
public int getSecond() { return second; }
/** Increments the elements of this pair by the
* corresponding elements of the pair p and returns this
* pair (modified). This method does not allocate a new pair, but changes
* in place the values stored in the object the method is invoked from. The
* method is unsafe: if p is null, a runtime exception will be thrown.
* @param p
* @return
*/
public PrimitivePair.Int add(PrimitivePair.Int p) {
first += p.first;
second += p.second;
return this;
}
/** Decrements the elements of this pair by the
* corresponding elements of the pair p and returns this
* pair (modified). This method does not allocate a new pair, but changes
* in place the values stored in the object the method is invoked from. The
* method is unsafe: if p is null, a runtime exception will be thrown.
* @param p
* @return
*/
public PrimitivePair.Int subtract(PrimitivePair.Int p) {
first -= p.first;
second -= p.second;
return this;
}
/** Copies values from the argument p into the corresponding
* elements of this pair and returns this pair (modified).
* @param p
* @return
*/
public PrimitivePair.Int assignFrom(PrimitivePair.Int p ) {
first = p.first;
second = p.second;
return this;
}
}
public static class Long {
// declare public, STL-style for easier and more efficient access:
public long first;
public long second;
public Long(long x, long y) { first = x; second = y; }
public Long() { first = second = 0; }
public void set(long x, long y) { first = x; second = y; }
/** Java-style getter; note that we currently allow direct access to
the member field.
*/
public long getFirst() { return first; }
/** Java-style getter; note that we currently allow direct access to
the member field.
*/
public long getSecond() { return second; }
/** Increments the elements of this pair by the
* corresponding elements of the pair p and returns this
* pair (modified). This method does not allocate a new pair, but changes
* in place the values stored in the object the method is invoked from. The
* method is unsafe: if p is null, a runtime exception will be thrown.
* @param p
* @return
*/
public PrimitivePair.Long add(PrimitivePair.Int p) {
first += p.first;
second += p.second;
return this;
}
/** Increments the elements of this pair by the
* corresponding elements of the pair p and returns this
* pair (modified). This method does not allocate a new pair, but changes
* in place the values stored in the object the method is invoked from. The
* method is unsafe: if p is null, a runtime exception will be thrown.
* @param p
* @return
*/
public PrimitivePair.Long add(PrimitivePair.Long p) {
first += p.first;
second += p.second;
return this;
}
/** Decrements the elements of this pair by the
* corresponding elements of the pair p and returns this
* pair (modified). This method does not allocate a new pair, but changes
* in place the values stored in the object the method is invoked from. The
* method is unsafe: if p is null, a runtime exception will be thrown.
* @param p
* @return
*/
public PrimitivePair.Long subtract(PrimitivePair.Int p) {
first -= p.first;
second -= p.second;
return this;
}
/** Decrements the elements of this pair by the
* corresponding elements of the pair p and returns this
* pair (modified). This method does not allocate a new pair, but changes
* in place the values stored in the object the method is invoked from. The
* method is unsafe: if p is null, a runtime exception will be thrown.
* @param p
* @return
*/
public PrimitivePair.Long subtract(PrimitivePair.Long p) {
first -= p.first;
second -= p.second;
return this;
}
/** Copies values from the argument p into the corresponding
* elements of this pair and returns this pair (modified).
* @param p
* @return
*/
public PrimitivePair.Long assignFrom(PrimitivePair.Long p ) {
first = p.first;
second = p.second;
return this;
}
/** Copies values from the argument p into the corresponding
* elements of this pair and returns this pair (modified).
* @param p
* @return
*/
public PrimitivePair.Long assignFrom(PrimitivePair.Int p ) {
first = p.first;
second = p.second;
return this;
}
}
}
================================================
FILE: src/main/java/org/ncic/bioinfo/sparkseq/algorithms/data/reference/RefContentProvider.java
================================================
/*
* Copyright (c) 2017 NCIC, Institute of Computing Technology, Chinese Academy of Sciences
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package org.ncic.bioinfo.sparkseq.algorithms.data.reference;
import htsjdk.samtools.SAMSequenceDictionary;
import org.ncic.bioinfo.sparkseq.algorithms.utils.GenomeLoc;
import org.ncic.bioinfo.sparkseq.data.partition.FastaPartition;
/**
 * Provides the bases of the reference for one partition; wraps a {@link FastaPartition}.
 *
 * Author: wbc
 */
public class RefContentProvider {

    /**
     * Bases held by this partition. Index 0 corresponds to
     * {@link #safeOverlappedStartCoordinate} on the contig.
     */
    private final byte[] content;

    /**
     * Id of the contig
     */
    private final int contigId;

    /**
     * Name of the contig
     */
    private final String contigName;

    /**
     * The "safe overlap" range is only used when fetching reference data.
     */
    private final int safeOverlappedStartCoordinate;
    private final int safeOverlappedEndCoordinate;

    private final int overlappedStartCoordinate;
    private final int overlappedEndCoordinate;

    private final int originStartCoordinate;
    private final int originEndCoordinate;

    private final SAMSequenceDictionary samSequenceDictionary;

    /**
     * Copies the partition's content into a byte array and records its coordinates.
     *
     * @param samSequenceDictionary sequence dictionary handed back by {@link #getSamSequenceDictionary()}
     * @param fastaPartition        partition supplying the contig id/name, bases and coordinates
     */
    public RefContentProvider(SAMSequenceDictionary samSequenceDictionary,
                              FastaPartition fastaPartition) {
        contigId = fastaPartition.contigId();
        contigName = fastaPartition.contigName();

        String rawContent = fastaPartition.content();
        int contentLen = rawContent.length();
        content = new byte[contentLen];
        // Every reference character other than upper-case A, G, C or T is stored as N.
        for (int i = 0; i < contentLen; i++) {
            byte base = (byte) rawContent.charAt(i);
            if (base == 'A' || base == 'G' || base == 'C' || base == 'T') {
                content[i] = base;
            } else {
                content[i] = 'N';
            }
        }

        safeOverlappedStartCoordinate = fastaPartition.safeOverlappedStart();
        safeOverlappedEndCoordinate = fastaPartition.safeOverlappedEnd();
        overlappedStartCoordinate = fastaPartition.overlappedStart();
        overlappedEndCoordinate = fastaPartition.overlappedEnd();
        originStartCoordinate = fastaPartition.originStart();
        originEndCoordinate = fastaPartition.originEnd();
        this.samSequenceDictionary = samSequenceDictionary;
    }

    /**
     * Gets the interval this partition is responsible for (the overlapped range).
     *
     * @return a new GenomeLoc spanning the overlapped start/end coordinates
     */
    public GenomeLoc getLocus() {
        return new GenomeLoc(contigName, contigId, overlappedStartCoordinate, overlappedEndCoordinate);
    }

    /**
     * Extracts the reference bases covered by {@code locus}. The requested range is clipped
     * to the bases this provider holds, so the returned context may be shorter than the locus.
     *
     * @param locus the range to extract
     * @return a context whose locus is the clipped range and whose bases are a fresh copy
     * @throws IllegalArgumentException if the locus lies entirely outside the held bases
     *                                  (previously this surfaced as a NegativeArraySizeException)
     */
    public ReferenceContext getReferenceContext(GenomeLoc locus) {
        // Translate contig coordinates into indexes of the content array, clipping to its bounds.
        final int start = Math.max(locus.getStart() - safeOverlappedStartCoordinate, 0);
        final int end = Math.min(locus.getStop() - safeOverlappedStartCoordinate, content.length - 1);

        final int length = end - start + 1;
        if (length < 0) {
            throw new IllegalArgumentException("Locus " + locus.getContig() + ":" + locus.getStart()
                    + "-" + locus.getStop() + " does not overlap the reference partition starting at "
                    + safeOverlappedStartCoordinate + " (" + content.length + " bases)");
        }

        final byte[] basesCache = new byte[length];
        System.arraycopy(content, start, basesCache, 0, length);

        GenomeLoc newLocus = new GenomeLoc(locus.getContig(), locus.getContigIndex(),
                start + safeOverlappedStartCoordinate, end + safeOverlappedStartCoordinate);
        return new ReferenceContext(newLocus, contigId, basesCache);
    }

    /**
     * Same as {@link #getReferenceContext(GenomeLoc)} after widening the locus by
     * {@code overlapLength} bases on both sides.
     *
     * @param locus         the range to extract
     * @param overlapLength number of bases added before the start and after the stop
     * @return the context for the widened (and then clipped) range
     */
    public ReferenceContext getReferenceContext(GenomeLoc locus, int overlapLength) {
        GenomeLoc newLocus = new GenomeLoc(locus.getContig(), locus.getContigIndex(),
                locus.getStart() - overlapLength, locus.getStop() + overlapLength);
        return getReferenceContext(newLocus);
    }

    public SAMSequenceDictionary getSamSequenceDictionary() {
        return samSequenceDictionary;
    }

    public int getOriginStartCoordinate() {
        return originStartCoordinate;
    }

    public int getOriginEndCoordinate() {
        return originEndCoordinate;
    }
}
================================================
FILE: src/main/java/org/ncic/bioinfo/sparkseq/algorithms/data/reference/RefMetaDataTracker.java
================================================
/*
* Copyright (c) 2017 NCIC, Institute of Computing Technology, Chinese Academy of Sciences
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package org.ncic.bioinfo.sparkseq.algorithms.data.reference;
import htsjdk.tribble.Feature;
import org.apache.log4j.Logger;
import org.ncic.bioinfo.sparkseq.algorithms.utils.GenomeLoc;
import org.ncic.bioinfo.sparkseq.algorithms.data.vcf.GATKFeature;
import org.ncic.bioinfo.sparkseq.algorithms.data.vcf.RODRecordList;
import org.ncic.bioinfo.sparkseq.algorithms.data.vcf.RODRecordListImpl;
import org.ncic.bioinfo.sparkseq.algorithms.data.vcf.RodBinding;
import org.ncic.bioinfo.sparkseq.exceptions.UserException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* Author: wbc
*/
public class RefMetaDataTracker {
private final static RODRecordList EMPTY_ROD_RECORD_LIST = new RODRecordListImpl("EMPTY");
final Map bindings;
final protected static Logger logger = Logger.getLogger(RefMetaDataTracker.class);
public final static RefMetaDataTracker EMPTY_TRACKER = new RefMetaDataTracker();
/**
* Create an tracker with no bindings
*/
public RefMetaDataTracker() {
bindings = Collections.emptyMap();
}
public RefMetaDataTracker(final Collection allBindings) {
// set up the bindings
if (allBindings.isEmpty())
bindings = Collections.emptyMap();
else {
final Map tmap = new HashMap(allBindings.size());
for (RODRecordList rod : allBindings) {
if (rod != null && !rod.isEmpty())
tmap.put(canonicalName(rod.getName()), rod);
}
// ensure that no one modifies the bindings itself
bindings = Collections.unmodifiableMap(tmap);
}
}
// ------------------------------------------------------------------------------------------
//
//
// Generic accessors
//
//
// ------------------------------------------------------------------------------------------
/**
* Gets all of the Tribble features spanning this locus, returning them as a list of specific
* type T extending Feature. This function looks across all tracks to find the Features, so
* if you have two tracks A and B each containing 1 Feature, then getValues will return
* a list containing both features.
*
* Note that this function assumes that all of the bound features are instances of or
* subclasses of T. A ClassCastException will occur if this isn't the case. If you want
* to get all Features without any danger of such an exception use the root Tribble
* interface Feature.
*
* @param type The type of the underlying objects bound here
* @param as above
* @return A freshly allocated list of all of the bindings, or an empty list if none are bound.
*/
public List getValues(final Class type) {
return addValues(bindings.keySet(), type, new ArrayList(), null, false, false);
}
/**
* Provides the same functionality as @link #getValues(Class) but will only include
* Features that start as the GenomeLoc provide onlyAtThisLoc.
*
* @param type The type of the underlying objects bound here
* @param onlyAtThisLoc
* @param as above
* @return A freshly allocated list of all of the bindings, or an empty list if none are bound.
*/
public List getValues(final Class type, final GenomeLoc onlyAtThisLoc) {
return addValues(bindings.keySet(), type, new ArrayList(), onlyAtThisLoc, true, false);
}
/**
* Uses the same logic as @link #getValues(Class) but arbitrary select one of the resulting
* elements of the list to return. That is, if there would be two elements in the result of
*
* @param type The type of the underlying objects bound here
* @param as above
* @return A random single element the RODs bound here, or null if none are bound.
* @link #getValues(Class), one of these two is selected, and which one it will be isn't
* specified. Consequently, this method is only really safe if (1) you absolutely know
* that only one binding will meet the constraints of @link #getValues(Class) or (2)
* you truly don't care which of the multiple bindings available you are going to examine.
*
* If there are no bindings here, getFirstValue() return null
*/
public T getFirstValue(final Class type) {
return safeGetFirst(getValues(type));
}
/**
* Uses the same logic as @link #getValue(Class,GenomeLoc) to determine the list
* of eligible Features and @link #getFirstValue(Class) to select a single
* element from the interval list.
*
* @param type The type of the underlying objects bound here
* @param as above
* @param onlyAtThisLoc only Features starting at this site are considered
* @return A random single element the RODs bound here starting at onlyAtThisLoc, or null if none are bound.
*/
public T getFirstValue(final Class type, final GenomeLoc onlyAtThisLoc) {
return safeGetFirst(getValues(type, onlyAtThisLoc));
}
/**
* Same logic as @link #getFirstValue(RodBinding, boolean) but prioritizes records from prioritizeThisLoc if available
*
* @param rodBindings Only Features coming from the tracks associated with one of rodBindings are fetched
* @param The Tribble Feature type of the rodBinding, and consequently the type of the resulting list of Features
* @param prioritizeThisLoc only Features starting at this site are considered
* @return A freshly allocated list of all of the bindings, or an empty list if none are bound.
*/
public List getPrioritizedValue(final Collection> rodBindings, final GenomeLoc prioritizeThisLoc) {
final List results = new ArrayList<>();
for (final RodBinding rodBinding : rodBindings) {
// if there's a value at the prioritized location, take it
T value = getFirstValue(rodBinding, prioritizeThisLoc);
// otherwise, grab any one
if (value == null)
value = getFirstValue(rodBinding);
// add if not null
if (value != null)
results.add(value);
}
return results;
}
/**
* Gets all of the Tribble features bound to RodBinding spanning this locus, returning them as
* a list of specific type T extending Feature.
*
* Note that this function assumes that all of the bound features are instances of or
* subclasses of T. A ClassCastException will occur if this isn't the case.
*
* @param rodBinding Only Features coming from the track associated with this rodBinding are fetched
* @param The Tribble Feature type of the rodBinding, and consequently the type of the resulting list of Features
* @return A freshly allocated list of all of the bindings, or an empty list if none are bound.
*/
public List getValues(final RodBinding rodBinding) {
return addValues(rodBinding.getName(), rodBinding.getType(), new ArrayList(1), getTrackDataByName(rodBinding), null, false, false);
}
/**
* Gets all of the Tribble features bound to any RodBinding in rodBindings,
* spanning this locus, returning them as a list of specific type T extending Feature.
*
* Note that this function assumes that all of the bound features are instances of or
* subclasses of T. A ClassCastException will occur if this isn't the case.
*
* @param rodBindings Only Features coming from the tracks associated with one of rodBindings are fetched
* @param The Tribble Feature type of the rodBinding, and consequently the type of the resulting list of Features
* @return A freshly allocated list of all of the bindings, or an empty list if none are bound.
*/
public List getValues(final Collection> rodBindings) {
List results = new ArrayList(1);
for (RodBinding rodBinding : rodBindings)
results.addAll(getValues(rodBinding));
return results;
}
public List getValues(String name) {
List results = new ArrayList(1);
RODRecordList recordList = bindings.get(name);
if (recordList != null) {
for (GATKFeature feature : recordList)
results.add((T) feature);
}
return results;
}
public List getAllValues() {
List results = new ArrayList(1);
for (RODRecordList recordList : bindings.values()) {
if (recordList != null) {
for (GATKFeature feature : recordList)
results.add((T) feature);
}
}
return results;
}
/**
* The same logic as @link #getValues(RodBinding) but enforces that each Feature start at onlyAtThisLoc
*
* @param rodBinding Only Features coming from the track associated with this rodBinding are fetched
* @param The Tribble Feature type of the rodBinding, and consequently the type of the resulting list of Features
* @param onlyAtThisLoc only Features starting at this site are considered
* @return A freshly allocated list of all of the bindings, or an empty list if none are bound.
*/
public List getValues(final RodBinding rodBinding, final GenomeLoc onlyAtThisLoc) {
return addValues(rodBinding.getName(), rodBinding.getType(), new ArrayList(1), getTrackDataByName(rodBinding), onlyAtThisLoc, true, false);
}
/**
* The same logic as @link #getValues(List) but enforces that each Feature start at onlyAtThisLoc
*
* @param rodBindings Only Features coming from the tracks associated with one of rodBindings are fetched
* @param The Tribble Feature type of the rodBinding, and consequently the type of the resulting list of Features
* @param onlyAtThisLoc only Features starting at this site are considered
* @return A freshly allocated list of all of the bindings, or an empty list if none are bound.
*/
public List getValues(final Collection> rodBindings, final GenomeLoc onlyAtThisLoc) {
List results = new ArrayList(1);
for (RodBinding rodBinding : rodBindings)
results.addAll(getValues(rodBinding, onlyAtThisLoc));
return results;
}
/**
* Uses the same logic as @getValues(RodBinding) to determine the list
* of eligible Features and select a single element from the resulting set
* of eligible features.
*
* @param rodBinding Only Features coming from the track associated with this rodBinding are fetched
* @param as above
* @return A random single element the eligible Features found, or null if none are bound.
*/
public T getFirstValue(final RodBinding rodBinding) {
return safeGetFirst(addValues(rodBinding.getName(), rodBinding.getType(), null, getTrackDataByName(rodBinding), null, false, true));
}
/**
* Uses the same logic as @getValues(RodBinding, GenomeLoc) to determine the list
* of eligible Features and select a single element from the resulting set
* of eligible features.
*
* @param rodBinding Only Features coming from the track associated with this rodBinding are fetched
* @param as above
* @param onlyAtThisLoc only Features starting at this site are considered
* @return A random single element the eligible Features found, or null if none are bound.
*/
public T getFirstValue(final RodBinding rodBinding, final GenomeLoc onlyAtThisLoc) {
return safeGetFirst(addValues(rodBinding.getName(), rodBinding.getType(), null, getTrackDataByName(rodBinding), onlyAtThisLoc, true, true));
}
/**
* Uses the same logic as @getValues(List) to determine the list
* of eligible Features and select a single element from the resulting set
* of eligible features.
*
* @param rodBindings Only Features coming from the tracks associated with these rodBindings are fetched
* @param as above
* @return A random single element the eligible Features found, or null if none are bound.
*/
public T getFirstValue(final Collection> rodBindings) {
for (RodBinding rodBinding : rodBindings) {
T val = getFirstValue(rodBinding);
if (val != null)
return val;
}
return null;
}
/**
* Uses the same logic as @getValues(RodBinding,GenomeLoc) to determine the list
* of eligible Features and select a single element from the resulting set
* of eligible features.
*
* @param rodBindings Only Features coming from the tracks associated with these rodBindings are fetched
* @param as above
* @param onlyAtThisLoc only Features starting at this site are considered
* @return A random single element the eligible Features found, or null if none are bound.
*/
public T getFirstValue(final Collection> rodBindings, final GenomeLoc onlyAtThisLoc) {
for (RodBinding rodBinding : rodBindings) {
T val = getFirstValue(rodBinding, onlyAtThisLoc);
if (val != null)
return val;
}
return null;
}
/**
* Is there a binding at this site to a ROD/track with the specified name?
*
* @param rodBinding the rod binding we want to know about
* @return true if any Features are bound in this tracker to rodBinding
*/
public boolean hasValues(final RodBinding rodBinding) {
return bindings.containsKey(canonicalName(rodBinding.getName()));
}
/**
* Get all of the RMD tracks at the current site. Each track is returned as a single compound
* object (RODRecordList) that may contain multiple RMD records associated with the current site.
*
* @return List of all tracks
*/
public List getBoundRodTracks() {
return new ArrayList(bindings.values());
}
/**
* The number of tracks with at least one value bound here
*
* @return the number of tracks with at least one bound Feature
*/
public int getNTracksWithBoundFeatures() {
return bindings.size();
}
// ------------------------------------------------------------------------------------------
// Protected accessors using strings for unit testing
// ------------------------------------------------------------------------------------------
protected boolean hasValues(final String name) {
return bindings.containsKey(canonicalName(name));
}
protected List getValues(final Class type, final String name) {
return addValues(name, type, new ArrayList(), getTrackDataByName(name), null, false, false);
}
protected List getValues(final Class type, final String name, final GenomeLoc onlyAtThisLoc) {
return addValues(name, type, new ArrayList(), getTrackDataByName(name), onlyAtThisLoc, true, false);
}
protected T getFirstValue(final Class type, final String name) {
return safeGetFirst(getValues(type, name));
}
protected T getFirstValue(final Class type, final String name, final GenomeLoc onlyAtThisLoc) {
return safeGetFirst(getValues(type, name, onlyAtThisLoc));
}
// ------------------------------------------------------------------------------------------
//
//
// Private utility functions
//
//
// ------------------------------------------------------------------------------------------
/**
* Helper function for getFirst() operations that takes a list of and
* returns the first element, or null if no such element exists.
*
* @param l
* @param
* @return
*/
private T safeGetFirst(final List l) {
return l.isEmpty() ? null : l.get(0);
}
private List addValues(final Collection names,
final Class type,
List values,
final GenomeLoc curLocation,
final boolean requireStartHere,
final boolean takeFirstOnly) {
for (String name : names) {
RODRecordList rodList = getTrackDataByName(name); // require that the name is an exact match
values = addValues(name, type, values, rodList, curLocation, requireStartHere, takeFirstOnly);
if (takeFirstOnly && !values.isEmpty())
break;
}
return values;
}
private List addValues(final String name,
final Class type,
List values,
final RODRecordList rodList,
final GenomeLoc curLocation,
final boolean requireStartHere,
final boolean takeFirstOnly) {
for (GATKFeature rec : rodList) {
if (!requireStartHere || rec.getLocation().getStart() == curLocation.getStart()) { // ok, we are going to keep this thing
Object obj = rec.getUnderlyingObject();
if (!(type.isAssignableFrom(obj.getClass())))
throw new UserException.CommandLineException("Unable to cast track named " + name + " to type of " + type.toString()
+ " it's of type " + obj.getClass());
T objT = (T) obj;
if (takeFirstOnly) {
if (values == null)
values = Arrays.asList(objT);
else
values.add(objT);
break;
} else {
if (values == null)
values = new ArrayList();
values.add(objT);
}
}
}
return values == null ? Collections.emptyList() : values;
}
/**
* Finds the reference metadata track named 'name' and returns all ROD records from that track associated
* with the current site as a RODRecordList List object. If no data track with specified name is available,
* returns defaultValue wrapped as RODRecordList object. NOTE: if defaultValue is null, it will be wrapped up
* with track name set to 'name' and location set to null; otherwise the wrapper object will have name and
* location set to defaultValue.getID() and defaultValue.getLocation(), respectively (use caution,
* defaultValue.getLocation() may be not equal to what RODRecordList's location would be expected to be otherwise:
* for instance, on locus traversal, location is usually expected to be a single base we are currently looking at,
* regardless of the presence of "extended" RODs overlapping with that location).
*
* @param name track name
* @return track data for the given rod
*/
private RODRecordList getTrackDataByName(final String name) {
final String luName = canonicalName(name);
RODRecordList l = bindings.get(luName);
return l == null ? EMPTY_ROD_RECORD_LIST : l;
}
private RODRecordList getTrackDataByName(final RodBinding binding) {
return getTrackDataByName(binding.getName());
}
/**
* Returns the canonical name of the rod name (lowercases it)
*
* @param name the name of the rod
* @return canonical name of the rod
*/
private String canonicalName(final String name) {
// todo -- remove me after switch to RodBinding syntax
return name.toLowerCase();
}
}
================================================
FILE: src/main/java/org/ncic/bioinfo/sparkseq/algorithms/data/reference/ReferenceContext.java
================================================
/*
* Copyright (c) 2017 NCIC, Institute of Computing Technology, Chinese Academy of Sciences
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package org.ncic.bioinfo.sparkseq.algorithms.data.reference;
import org.ncic.bioinfo.sparkseq.algorithms.utils.GenomeLoc;
/**
 * Reference bases for a locus, optionally embedded in a larger surrounding window.
 * The {@code bases} array is indexed from the start of the window.
 *
 * Author: wbc
 */
public class ReferenceContext {

    final int contigId;

    /**
     * The locus currently being examined.
     */
    final private GenomeLoc locus;

    /**
     * Bases of the window; element 0 is the base at {@code window.getStart()}.
     */
    final private byte[] bases;

    /**
     * The window of reference information around the current locus.
     */
    final private GenomeLoc window;

    /**
     * Creates a context whose window is the locus itself.
     *
     * @param locus    the locus being examined
     * @param contigId id of the contig the locus lies on
     * @param bases    bases covering the locus
     */
    public ReferenceContext(GenomeLoc locus, int contigId, byte[] bases) {
        this.locus = locus;
        this.window = locus;
        this.contigId = contigId;
        this.bases = bases;
    }

    /**
     * Creates a context with an explicit window around the locus.
     *
     * @param locus    the locus being examined
     * @param window   the window the bases cover
     * @param contigId id of the contig the locus lies on
     * @param bases    bases covering the window
     */
    public ReferenceContext(GenomeLoc locus, GenomeLoc window, int contigId, byte[] bases) {
        this.locus = locus;
        this.window = window;
        this.contigId = contigId;
        this.bases = bases;
    }

    /**
     * Contig id of this reference
     *
     * @return contig id
     */
    public int getContigId() {
        return contigId;
    }

    /**
     * The locus currently being examined.
     *
     * @return The current locus.
     */
    public GenomeLoc getLocus() {
        return locus;
    }

    public GenomeLoc getWindow() {
        return window;
    }

    /**
     * Get the base at the given locus.
     *
     * The bases array starts at the window start, so the locus base sits at the offset of the
     * locus inside the window (offset 0 when the window is the locus itself).
     *
     * @return The base at the given locus from the reference.
     */
    public byte getBase() {
        return bases[locusOffsetInWindow()];
    }

    /**
     * All the bases in the window currently being examined.
     *
     * @return All bases available. If the window is the locus itself, the array
     * contains only the bases of the locus. The internal array is returned, not a copy.
     */
    public byte[] getBases() {
        return bases;
    }

    /**
     * The bases from the start of the locus to the end of the window.
     *
     * @return a freshly allocated array holding the tail of the window bases
     */
    public byte[] getForwardBases() {
        final int mid = locusOffsetInWindow();
        // Copy the bytes directly; going through String would depend on the platform charset.
        final byte[] forward = new byte[bases.length - mid];
        System.arraycopy(bases, mid, forward, 0, forward.length);
        return forward;
    }

    /** Index into {@code bases} of the first base of the locus. */
    private int locusOffsetInWindow() {
        return locus.getStart() - window.getStart();
    }
}
================================================
FILE: src/main/java/org/ncic/bioinfo/sparkseq/algorithms/data/sam/AlignmentContext.java
================================================
/*
* Copyright (c) 2017 NCIC, Institute of Computing Technology, Chinese Academy of Sciences
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package org.ncic.bioinfo.sparkseq.algorithms.data.sam;
import org.ncic.bioinfo.sparkseq.exceptions.ReviewedGATKException;
import org.ncic.bioinfo.sparkseq.algorithms.utils.GenomeLoc;
import org.ncic.bioinfo.sparkseq.algorithms.utils.HasGenomeLocation;
/**
 * Pairs a genomic location with the read pileup observed at it, so that locus
 * context data can be forwarded from an iterator.
 * Author: wbc
 */
public class AlignmentContext implements HasGenomeLocation {

    protected GenomeLoc loc;
    protected ReadBackedPileup basePileup;
    protected boolean hasPileupBeenDownsampled;

    /**
     * Count of reference bases skipped since the last map invocation.
     * Only filled in by RodTraversals right now; the two-argument constructor sets it to 0.
     */
    private long skippedBases;

    /**
     * Builds a context with no skipped bases and a pileup that has not been downsampled.
     *
     * @param loc        the location of this context; must not be null
     * @param basePileup the pileup at that location; must not be null
     */
    public AlignmentContext(GenomeLoc loc, ReadBackedPileup basePileup) {
        this(loc, basePileup, 0L, false);
    }

    /**
     * Builds a fully specified context.
     *
     * @param loc                      the location of this context; must not be null
     * @param basePileup               the pileup at that location; must not be null
     * @param skippedBases             number of skipped reference bases; must not be negative
     * @param hasPileupBeenDownsampled whether reads were removed from the pileup
     * @throws ReviewedGATKException if any of the above requirements is violated
     */
    public AlignmentContext(GenomeLoc loc, ReadBackedPileup basePileup, long skippedBases, boolean hasPileupBeenDownsampled) {
        if (loc == null) {
            throw new ReviewedGATKException("BUG: GenomeLoc in Alignment context is null");
        }
        if (basePileup == null) {
            throw new ReviewedGATKException("BUG: ReadBackedPileup in Alignment context is null");
        }
        if (skippedBases < 0) {
            throw new ReviewedGATKException("BUG: skippedBases is -1 in Alignment context");
        }

        this.hasPileupBeenDownsampled = hasPileupBeenDownsampled;
        this.skippedBases = skippedBases;
        this.basePileup = basePileup;
        this.loc = loc;
    }

    /**
     * The base pileup over the current genomic location. The constructors reject a null pileup.
     *
     * @return the pileup held by this context
     */
    public ReadBackedPileup getBasePileup() {
        return this.basePileup;
    }

    /**
     * Whether reads have been filtered out of the pileup due to excess depth of coverage.
     *
     * @return True if reads have been filtered out. False otherwise.
     */
    public boolean hasPileupBeenDownsampled() {
        return this.hasPileupBeenDownsampled;
    }

    /**
     * How many reads cover this locus?
     *
     * @return the number of elements in the base pileup
     */
    public int size() {
        return this.basePileup.getNumberOfElements();
    }

    /**
     * @return the contig of this context's location
     */
    public String getContig() {
        final GenomeLoc here = getLocation();
        return here.getContig();
    }

    /**
     * @return the start coordinate of this context's location
     */
    public long getPosition() {
        final GenomeLoc here = getLocation();
        return here.getStart();
    }

    /**
     * @return the location of this context
     */
    public GenomeLoc getLocation() {
        return this.loc;
    }
}
================================================
FILE: src/main/java/org/ncic/bioinfo/sparkseq/algorithms/data/sam/AlignmentStateMachine.java
================================================
/*
* Copyright (c) 2017 NCIC, Institute of Computing Technology, Chinese Academy of Sciences
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package org.ncic.bioinfo.sparkseq.algorithms.data.sam;
import htsjdk.samtools.Cigar;
import htsjdk.samtools.CigarElement;
import htsjdk.samtools.CigarOperator;
import org.ncic.bioinfo.sparkseq.exceptions.UserException;
/**
* Steps a single read along its alignment to the genome
*
* The logical model for generating extended events is as follows: the "record state"
* implements the traversal along the reference; thus stepForwardOnGenome() returns
* on every and only on actual reference bases. This can be a (mis)match or a deletion
* (in the latter case, we still return on every individual reference base the deletion spans).
*
* User: depristo
* Date: 1/5/13
* Time: 1:08 PM
*/
public class AlignmentStateMachine {
/**
* Our read
*/
private final GATKSAMRecord read;
private final Cigar cigar;
private final int nCigarElements;
private int currentCigarElementOffset = -1;
/**
* how far are we offset from the start of the read bases?
*/
private int readOffset;
/**
* how far are we offset from the alignment start on the genome?
*/
private int genomeOffset;
/**
* Our cigar element
*/
private CigarElement currentElement;
/**
* how far are we into our cigarElement?
*/
private int offsetIntoCurrentCigarElement;
public AlignmentStateMachine(final GATKSAMRecord read) {
this.read = read;
this.cigar = read.getCigar();
this.nCigarElements = cigar.numCigarElements();
initializeAsLeftEdge();
}
/**
 * Resets the read, genome and intra-element offsets to -1 and clears the current
 * cigar element, i.e. places this machine one bp before the start of the alignment
 * so that a call to stepForwardOnGenome() advances to the first proper location.
 */
private void initializeAsLeftEdge() {
    currentElement = null;
    genomeOffset = -1;
    offsetIntoCurrentCigarElement = -1;
    readOffset = -1;
}
/**
 * The read this machine steps along the genome.
 *
 * @return the GATKSAMRecord passed to the constructor
 */
public GATKSAMRecord getRead() {
    return this.read;
}
/**
 * The reference index reported by the underlying read.
 *
 * @return the reference index of the read
 */
public int getReferenceIndex() {
    final GATKSAMRecord alignedRead = getRead();
    return alignedRead.getReferenceIndex();
}
/**
 * Is this the left edge state, i.e. the state before the first position of the read?
 *
 * @return true if the read offset is -1, false otherwise
 */
public boolean isLeftEdge() {
    return this.readOffset == -1;
}
/**
 * Are we on the right edge, i.e. is the current state off the right of the alignment?
 *
 * @return true if the read offset equals the read length, false otherwise
 */
public boolean isRightEdge() {
    final int readLength = read.getReadLength();
    return readLength == readOffset;
}
/**
 * The current offset into the read's bases that aligns with the reference genome.
 *
 * @return the current read offset; -1 while on the left edge
 */
public int getReadOffset() {
    return this.readOffset;
}
/**
* Get the cigar element we're currently aligning with.
*
* For example, if the cigar string is 2M2D2M and we're in the second step of the
* first 2M, then this function returns the element 2M. After calling stepForwardOnGenome
* this function would return 2D.
*
* @return the cigar element, or null if we're the left edge
*/
public CigarElement getCurrentCigarElement() {
    // Plain accessor; the field is null whenever the machine is in an edge state.
    return this.currentElement;
}
/**
* Get the offset of the current cigar element among all cigar elements in the read
*
* Suppose our read's cigar is 1M2D3M, and we're at the first 1M. This would
* return 0. Stepping forward puts us in the 2D, so our offset is 1. Another
* step forward would result in a 1 again (we're in the second position of the 2D).
* Finally, one more step forward brings us to 2 (for the 3M element)
*
* @return the offset of the current cigar element in the read's cigar. Will return -1 for
* when the state is on the left edge, and be == the number of cigar elements in the
* read when we're past the last position on the genome
*/
public int getCurrentCigarElementOffset() {
    // Plain accessor for the index of the current element within the cigar.
    return this.currentCigarElementOffset;
}
/**
* Get the offset of the current state into the current cigar element
*
* That is, suppose we have a read with cigar 2M3D4M, and we're right at
* the second M position. offsetIntoCurrentCigarElement would be 1, as
* it's two elements into the 2M cigar. Now stepping forward we'd be
* in cigar element 3D, and our offsetIntoCurrentCigarElement would be 0.
*
* @return the offset (from 0) of the current state in the current cigar element.
* Will be 0 on the right edge, and -1 on the left.
*/
public int getOffsetIntoCurrentCigarElement() {
    // Plain accessor for the position inside the current cigar element.
    return this.offsetIntoCurrentCigarElement;
}
/**
* Convenience accessor of the CigarOperator of the current cigar element
*
* Robust to the case where we're on the edge, and currentElement is null, in which
* case this function returns null as well
*
* @return null if this is an edge state
*/
public CigarOperator getCigarOperator() {
    // Edge states carry no cigar element, and therefore no operator either.
    if (currentElement == null) {
        return null;
    }
    return currentElement.getOperator();
}
@Override
public String toString() {
    // Layout: read name, read offset (ro), genome offset (go), offset into the
    // current cigar element (cec), then the current cigar element itself.
    final String readName = read.getReadName();
    return String.format(
            "%s ro=%d go=%d cec=%d %s",
            readName,
            readOffset,
            genomeOffset,
            offsetIntoCurrentCigarElement,
            currentElement);
}
// -----------------------------------------------------------------------------------------------
//
// Code for setting up prev / next states
//
// -----------------------------------------------------------------------------------------------
/**
* Step the state machine forward one unit
*
* Takes the current state of this machine, and advances the state until the next on-genome
* cigar element (M, X, =, D, N) is encountered, at which point this function returns with the
* cigar operator of the current element.
*
* Assumes that the AlignmentStateMachine is in the left edge state at the start, so that
* stepForwardOnGenome() can be called to move the machine to the first alignment position. That
* is, the normal use of this code is:
*
* AlignmentStateMachine machine = new AlignmentStateMachine(read)
* machine.stepForwardOnGenome()
* // now the machine is at the first position on the genome
*
* When stepForwardOnGenome() advances off the right edge of the read, the state machine is
* left in a state such that isRightEdge() returns true, and this method returns null to indicate
* that the machine cannot advance further. The machine may explode, though this is not contracted,
* if stepForwardOnGenome() is called after a previous call returned null.
*
* @return the operator of the cigar element that machine stopped at, null if we advanced off the end of the read
*/
public CigarOperator stepForwardOnGenome() {
    // Loop until we either find a cigar element step that moves us one base on the genome,
    // or we run out of cigar elements.
    while ( true ) {
        // We enter each iteration with readOffset = index of the last processed base on the read
        // (-1 if we did not process a single base yet); this can be the last matching base,
        // or the last base of an insertion / soft clip.
        if (currentElement == null || (offsetIntoCurrentCigarElement + 1) >= currentElement.getLength()) {
            // The current element (if any) is used up: move on to the next one.
            currentCigarElementOffset++;
            if (currentCigarElementOffset < nCigarElements) {
                currentElement = cigar.getCigarElement(currentCigarElementOffset);
                offsetIntoCurrentCigarElement = -1;
                // Re-enter the loop so the exhaustion check above runs against the new element's
                // length; this is what guards against cigar elements of length 0.
                continue;
            }

            // No cigar elements are left. currentElement still refers to the last element of the
            // cigar (or is null if the cigar has no elements at all).
            if (currentElement != null && currentElement.getOperator() == CigarOperator.D)
                throw new UserException(read + " read ends with deletion. Cigar: " + read.getCigarString() + ". Although the SAM spec technically permits such reads, this is often indicative of malformed files. If you are sure you want to use this file, re-run your analysis with the extra option: -rf BadCigar");

            // Put the machine into the right-edge state: offset into the element is reset to 0 for
            // cleanliness, readOffset == read length makes isRightEdge() true, and no element is current.
            offsetIntoCurrentCigarElement = 0;
            readOffset = read.getReadLength();
            currentElement = null;
            // This branch runs whenever the cigar is exhausted, whatever the last operator was.
            // Advancing genomeOffset once more models a reference position just past what has been
            // consumed so far (extended events need that); returning null tells the caller that we
            // are past the end of the read.
            genomeOffset++;
            return null;
        }

        offsetIntoCurrentCigarElement++;
        switch (currentElement.getOperator()) {
            case H: // ignore hard clips
            case P: // ignore pads
                // Consume the whole element at once; it moves neither the read nor the genome offset.
                offsetIntoCurrentCigarElement = currentElement.getLength();
                break;
            case I: // insertion w.r.t. the reference
            case S: // soft clip
                // Consume the whole element at once; its bases advance the read but not the genome.
                offsetIntoCurrentCigarElement = currentElement.getLength();
                readOffset += currentElement.getLength();
                break;
            case D: // deletion w.r.t. the reference
                if (readOffset < 0) // we don't want reads starting with deletion, this is a malformed cigar string
                    throw new UserException(read + " read starts with deletion. Cigar: " + read.getCigarString() + ". Although the SAM spec technically permits such reads, this is often indicative of malformed files. If you are sure you want to use this file, re-run your analysis with the extra option: -rf BadCigar");
                // otherwise handled exactly like a reference skip: fall through
            case N: // reference skip (processed just like a deletion, just different logical meaning)
                genomeOffset++;
                return currentElement.getOperator();
            case M:
            case EQ:
            case X:
                // One aligned base: advance on both the read and the genome.
                readOffset++;
                genomeOffset++;
                return currentElement.getOperator();
            default:
                throw new IllegalStateException("Case statement didn't deal with cigar op: " + currentElement.getOperator());
        }
    }
}
/**
* Create a new PileupElement based on the current state of this element
*
* Must not be a left or right edge
*
* @return a pileup element
*/
public final PileupElement makePileupElement() {
    // A pileup element can only be built while the machine is positioned inside the read.
    final boolean onEdge = isLeftEdge() || isRightEdge();
    if ( onEdge )
        throw new IllegalStateException("Cannot make a pileup element from an edge alignment state");

    // Snapshot the current position and pass it to the pileup element.
    final int offsetInRead = getReadOffset();
    final CigarElement element = getCurrentCigarElement();
    final int elementIndex = getCurrentCigarElementOffset();
    final int offsetInElement = getOffsetIntoCurrentCigarElement();
    return new PileupElement(read, offsetInRead, element, elementIndex, offsetInElement);
}
}
================================================
FILE: src/main/java/org/ncic/bioinfo/sparkseq/algorithms/data/sam/GATKSAMReadGroupRecord.java
================================================
/*
* Copyright (c) 2017 NCIC, Institute of Computing Technology, Chinese Academy of Sciences
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package org.ncic.bioinfo.sparkseq.algorithms.data.sam;
import htsjdk.samtools.SAMReadGroupRecord;
import org.ncic.bioinfo.sparkseq.algorithms.utils.NGSPlatform;
/**
* Author: wbc
*/
public class GATKSAMReadGroupRecord extends SAMReadGroupRecord {

    // Cached copies of values read from the underlying SAMReadGroupRecord.
    private String mSample = null;
    private String mPlatform = null;
    private NGSPlatform mNGSPlatform = null;

    // A cached value may itself be null, so separate flags record whether
    // each lookup has already been performed.
    private boolean retrievedSample = false;
    private boolean retrievedPlatform = false;
    private boolean retrievedNGSPlatform = false;

    /**
     * Create a read group record that carries only an id.
     *
     * @param id the read group id
     */
    public GATKSAMReadGroupRecord(final String id) {
        super(id);
    }

    /**
     * Create a read group record from an existing one, reusing its read group id.
     *
     * @param record the record to build from
     */
    public GATKSAMReadGroupRecord(SAMReadGroupRecord record) {
        super(record.getReadGroupId(), record);
    }

    /**
     * Get the NGSPlatform enum describing the platform of this read group.
     *
     * The value is resolved from the platform string on the first call and
     * cached, so later calls return the stored result.
     *
     * @return an NGSPlatform enum value
     */
    public NGSPlatform getNGSPlatform() {
        if (retrievedNGSPlatform) {
            return mNGSPlatform;
        }
        final String platform = getPlatform();
        mNGSPlatform = NGSPlatform.fromReadGroupPL(platform);
        retrievedNGSPlatform = true;
        return mNGSPlatform;
    }

    @Override
    public String toString() {
        final String id = getReadGroupId();
        return "GATKSAMReadGroupRecord @RG:" + id;
    }

    // ---------------------------------------------------------------------
    // The methods below are overridden so that the sample and platform
    // values are cached in this class.
    // ---------------------------------------------------------------------

    /**
     * @return the sample, fetched from the superclass on first use and cached afterwards
     */
    @Override
    public String getSample() {
        if (retrievedSample) {
            return mSample;
        }
        mSample = super.getSample();
        retrievedSample = true;
        return mSample;
    }

    /**
     * Set the sample on the underlying record and refresh the cached copy.
     *
     * @param s the new sample value
     */
    @Override
    public void setSample(String s) {
        super.setSample(s);
        retrievedSample = true;
        mSample = s;
    }

    /**
     * @return the platform, fetched from the superclass on first use and cached afterwards
     */
    @Override
    public String getPlatform() {
        if (retrievedPlatform) {
            return mPlatform;
        }
        mPlatform = super.getPlatform();
        retrievedPlatform = true;
        return mPlatform;
    }

    /**
     * Set the platform on the underlying record, refresh the cached copy, and
     * invalidate the cached NGSPlatform so it is resolved again on next use.
     *
     * @param s the new platform value
     */
    @Override
    public void setPlatform(String s) {
        super.setPlatform(s);
        retrievedPlatform = true;
        mPlatform = s;
        // The enum was derived from the old platform string; force a recalculation.
        retrievedNGSPlatform = false;
    }
}
================================================
FILE: src/main/java/org/ncic/bioinfo/sparkseq/algorithms/data/sam/GATKSAMRecord.java
================================================
/*
* Copyright (c) 2017 NCIC, Institute of Computing Technology, Chinese Academy of Sciences
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package org.ncic.bioinfo.sparkseq.algorithms.data.sam;
import htsjdk.samtools.CigarElement;
import htsjdk.samtools.CigarOperator;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMReadGroupRecord;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SAMUtils;
import org.ncic.bioinfo.sparkseq.algorithms.utils.ReadUtils;
import org.ncic.bioinfo.sparkseq.exceptions.ReviewedGATKException;
import org.ncic.bioinfo.sparkseq.algorithms.utils.EventType;
import org.ncic.bioinfo.sparkseq.algorithms.utils.NGSPlatform;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* Author: wbc
*/
public class GATKSAMRecord extends SAMRecord implements Cloneable{
// Base Quality Score Recalibrator specific attribute tags
public static final String BQSR_BASE_INSERTION_QUALITIES = "BI"; // base qualities for insertions
public static final String BQSR_BASE_DELETION_QUALITIES = "BD"; // base qualities for deletions
/**
* The default quality score for an insertion or deletion, if
* none are provided for this read.
*/
public static final byte DEFAULT_INSERTION_DELETION_QUAL = (byte)45;
// the SAMRecord data we're caching
private String mReadString = null;
private GATKSAMReadGroupRecord mReadGroup = null;
private final static int UNINITIALIZED = -1;
private int softStart = UNINITIALIZED;
private int softEnd = UNINITIALIZED;
private Integer adapterBoundary = null;
private boolean isStrandlessRead = false;
// because some values can be null, we don't want to duplicate effort
private boolean retrievedReadGroup = false;
// These temporary attributes were added here to make life easier for
// certain algorithms by providing a way to label or attach arbitrary data to
// individual GATKSAMRecords.
// These attributes exist in memory only, and are never written to disk.
private Map